This R script is used to validate the points in the ReSurvey database using RS indicators (indices + phenology + canopy height).
library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(here)
G3;here() starts at C:/Users/jimenezalfaro/OneDrive - Universidad de Oviedo/IMIB/Analyses/MOTIVATE/MOTIVATE_validation
g
library(gridExtra)
G3;
Adjuntando el paquete: ‘gridExtra’
gG3;The following object is masked from ‘package:dplyr’:
combine
g
library(readxl)
library(scales)
G3;
Adjuntando el paquete: ‘scales’
gG3;The following object is masked from ‘package:purrr’:
discard
gG3;The following object is masked from ‘package:readr’:
col_factor
g
library(sf)
G3;Linking to GEOS 3.13.1, GDAL 3.10.2, PROJ 9.5.1; sf_use_s2() is TRUE
g
library(rnaturalearth)
library(dtplyr)
G3;Registered S3 method overwritten by 'data.table':
method from
print.data.table
g
library(lme4)
G3;Cargando paquete requerido: Matrix
gG3;
Adjuntando el paquete: ‘Matrix’
gG3;The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
g
library(lmerTest)
G3;
Adjuntando el paquete: ‘lmerTest’
gG3;The following object is masked from ‘package:lme4’:
lmer
gG3;The following object is masked from ‘package:stats’:
step
g
library(car)
G3;Cargando paquete requerido: carData
gG3;
Adjuntando el paquete: ‘car’
gG3;The following object is masked from ‘package:dplyr’:
recode
gG3;The following object is masked from ‘package:purrr’:
some
g
library(ggeffects)
library(party)
G3;Cargando paquete requerido: grid
gG3;Cargando paquete requerido: mvtnorm
gG3;Cargando paquete requerido: modeltools
gG3;Cargando paquete requerido: stats4
gG3;
Adjuntando el paquete: ‘modeltools’
gG3;The following object is masked from ‘package:car’:
Predict
gG3;The following object is masked from ‘package:lme4’:
refit
gG3;Cargando paquete requerido: strucchange
gG3;Cargando paquete requerido: zoo
gG3;
Adjuntando el paquete: ‘zoo’
gG3;The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
gG3;Cargando paquete requerido: sandwich
gG3;
Adjuntando el paquete: ‘strucchange’
gG3;The following object is masked from ‘package:stringr’:
boundary
gG3;
Adjuntando el paquete: ‘party’
gG3;The following object is masked from ‘package:dplyr’:
where
g
library(partykit)
G3;Cargando paquete requerido: libcoin
gG3;
Adjuntando el paquete: ‘partykit’
gG3;The following objects are masked from ‘package:party’:
cforest, ctree, ctree_control, edge_simple, mob, mob_control, node_barplot,
node_bivplot, node_boxplot, node_inner, node_surv, node_terminal, varimp
g
library(moreparty)
G3;Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
g
library(doParallel)
G3;Cargando paquete requerido: foreach
gG3;
Adjuntando el paquete: ‘foreach’
gG3;The following objects are masked from ‘package:purrr’:
accumulate, when
gG3;Cargando paquete requerido: iterators
gG3;Cargando paquete requerido: parallel
g
library(strucchange)
library(ggparty)
G3;
Adjuntando el paquete: ‘ggparty’
gG3;The following object is masked from ‘package:ggeffects’:
get_predictions
g
library(caret)
G3;Cargando paquete requerido: lattice
gG3;
Adjuntando el paquete: ‘caret’
gG3;The following object is masked from ‘package:purrr’:
lift
g
library(moreparty)
library(randomForest)
G3;randomForest 4.7-1.2
gG3;Type rfNews() to see new features/changes/bug fixes.
gG3;
Adjuntando el paquete: ‘randomForest’
gG3;The following object is masked from ‘package:gridExtra’:
combine
gG3;The following object is masked from ‘package:dplyr’:
combine
gG3;The following object is masked from ‘package:ggplot2’:
margin
g
library(pROC)
G3;Type 'citation("pROC")' for a citation.
gG3;
Adjuntando el paquete: ‘pROC’
gG3;The following objects are masked from ‘package:stats’:
cov, smooth, var
g
library(corrplot)
G3;corrplot 0.95 loaded
g
library(rlang)
G3;
Adjuntando el paquete: ‘rlang’
gG3;The following objects are masked from ‘package:purrr’:
%@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, flatten_raw,
invoke, splice
g
library(stringr)
library(beepr)
library(foreach)
library(permimp)
library(yardstick)
G3;
Adjuntando el paquete: ‘yardstick’
gG3;The following objects are masked from ‘package:caret’:
precision, recall, sensitivity, specificity
gG3;The following object is masked from ‘package:readr’:
spec
g
# Print a tibble showing every column, regardless of console width
# (tibbles normally truncate columns that do not fit the screen).
printall <- function(tibble) {
  print(tibble, width = Inf)
}
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
# Define the folder with the previously fitted RF objects
folder_path <- here("objects", "RF")
# List only .RData/.rda files, as intended. The original call had no
# `pattern`, so ANY file in the folder would have been passed to load()
# and a stray non-RData file would make the loop fail.
rdata_files <- list.files(folder_path, pattern = "\\.(RData|rda)$",
                          ignore.case = TRUE, full.names = TRUE)
# Load each file into the global environment (returns the object names)
lapply(rdata_files, load, envir = .GlobalEnv)
[[1]]
[1] "rf1"
[[2]]
[1] "predictions_rf1"
[[3]]
[1] "varimp_rf1"
[[4]]
[1] "rf10"
[[5]]
[1] "predictions_rf10"
[[6]]
[1] "varimp_rf10"
[[7]]
[1] "rf11"
[[8]]
[1] "predictions_rf11"
[[9]]
[1] "varimp_rf11"
[[10]]
[1] "rf12"
[[11]]
[1] "predictions_rf12"
[[12]]
[1] "varimp_rf12"
[[13]]
[1] "rf17"
[[14]]
[1] "predictions_rf17"
[[15]]
[1] "varimp_rf17"
[[16]]
[1] "rf18"
[[17]]
[1] "predictions_rf18"
[[18]]
[1] "varimp_rf18"
[[19]]
[1] "rf19"
[[20]]
[1] "predictions_rf19"
[[21]]
[1] "varimp_rf19"
[[22]]
[1] "rf2"
[[23]]
[1] "predictions_rf2"
[[24]]
[1] "varimp_rf2"
[[25]]
[1] "rf20"
[[26]]
[1] "predictions_rf20"
[[27]]
[1] "varimp_rf20"
[[28]]
[1] "rf21"
[[29]]
[1] "predictions_rf21"
[[30]]
[1] "rf23"
[[31]]
[1] "predictions_rf23"
[[32]]
[1] "rf3"
[[33]]
[1] "predictions_rf3"
[[34]]
[1] "varimp_rf3"
[[35]]
[1] "rf4"
[[36]]
[1] "predictions_rf4"
[[37]]
[1] "varimp_rf4"
[[38]]
[1] "rf9"
[[39]]
[1] "predictions_rf9"
[[40]]
[1] "varimp_rf9"
# Load the plot-level validation table (RS indices + phenology + canopy
# height, one row per ReSurvey plot observation).
# NOTE(review): the file has a .csv extension but is tab-delimited
# (the parser reports Delimiter: "\t"), hence read_tsv and not read_csv.
data_validation<-read_tsv(here(
  "data", "clean","final_RS_data_bands_S2_all_20250804.csv"))
Rows: 32730 Columns: 137
── Column specification ──────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): EUNISa_1, EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, biogeo, unit, Lctnmth
dbl (130): PlotObservationID, EVI_auc_mar_oct, EVI_auc_slope, EVI_auc_threshold, EVI_avg_value...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
No parsing issues!
Do when all RS data is ready!
# Histogram of a single variable, restricted to the four EUNIS level-1
# groups analysed here (T, R, S, Q).
#
# data:    data frame with columns EUNISa_1 and the column named by x_var
# x_var:   character scalar, name of the column to plot
# x_label: x-axis label
# Returns a ggplot object (auto-printed when called at top level).
plot_histogram <- function(data, x_var, x_label) {
  data %>%
    dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
    # .data[[x_var]] is the recommended tidy-eval idiom for a column
    # name held in a string (replaces !!sym(x_var))
    ggplot(aes(x = .data[[x_var]])) +
    geom_histogram(color = "black", fill = "white") +
    labs(x = x_label, y = "Frequency") +
    theme_bw()
}
# Raincloud-style distribution plot (flat violin + jittered points +
# boxplot) of each y variable by EUNIS level-1 group; the group sample
# size is printed above each group's maximum. Prints one plot per
# variable.
#
# data:     data frame with EUNISa_1, EUNISa_1_descr and the y_vars columns
# y_vars:   character vector of column names to plot
# y_labels: axis labels, same length as y_vars
distr_plot <- function(data, y_vars, y_labels) {
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    p <- ggplot(data = data %>%
                  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = .data[[y_var]],
                    fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0),
                       alpha = 0.8) +
      # y is inherited from the plot aes; no need to repeat it here
      geom_point(aes(color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15),
                 size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      # fun.y and ..label.. are deprecated in ggplot2 >= 3.4 (3.5.2 is
      # attached above); use fun= and after_stat() instead
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = after_stat(label)),
                   vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      # guides(fill = FALSE) is deprecated; "none" is the supported value
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip()
    print(p)
  }
}
Ranges of min and max:
range(data_validation$NDVI_max, na.rm = T) # NDVI_max > 1 (slightly)
[1] 0.06064505 1.08505480
range(data_validation$NDMI_max, na.rm = T) # NDMI_max > 1 (slightly)
[1] -0.2202634 1.0960134
range(data_validation$NDWI_max, na.rm = T)
[1] -0.7291356 0.5942999
range(data_validation$SAVI_max, na.rm = T)
[1] 0.02386599 0.85981347
range(data_validation$EVI_max, na.rm = T) # EVI_max > 1 (slightly)
[1] 0.04756044 1.19383584
range(data_validation$NDVI_min, na.rm = T)
[1] -0.7058081 0.8423390
range(data_validation$NDMI_min, na.rm = T)
[1] -0.5601429 0.5758041
range(data_validation$NDWI_min, na.rm = T) # NDWI_min < -1 (slightly)
[1] -1.014830371 -0.008258131
range(data_validation$SAVI_min, na.rm = T)
[1] -0.5367440 0.6335289
range(data_validation$EVI_min, na.rm = T) # EVI_min < -1!
[1] -1.5674420 0.7891647
nrow(data_validation %>% dplyr::filter(NDVI_max > 1))
[1] 18
nrow(data_validation %>% dplyr::filter(NDMI_max > 1))
[1] 51
nrow(data_validation %>% dplyr::filter(EVI_max > 1))
[1] 9
nrow(data_validation %>% dplyr::filter(NDWI_min < -1))
[1] 4
nrow(data_validation %>% dplyr::filter(EVI_min < -1))
[1] 10
Histograms to check that max and min values are ok:
# Histograms of the annual maxima and minima of every index; the label
# is the variable name with the underscore replaced by a space
# ("NDVI_max" -> "NDVI max"). local() keeps the loop variable out of
# the global environment.
local({
  for (v in c("NDVI_max", "NDMI_max", "NDWI_max", "SAVI_max", "EVI_max",
              "NDVI_min", "NDMI_min", "NDWI_min", "SAVI_min", "EVI_min")) {
    print(plot_histogram(data_validation, v, gsub("_", " ", v, fixed = TRUE)))
  }
})
nrow(data_validation %>%
dplyr::filter(EVI_max > 1 | EVI_max < -1))
[1] 9
data_validation %>%
dplyr::filter(EVI_max > 1 | EVI_max < -1) %>%
count(biogeo, unit)
Most EVI values are ok!
Distribution plots:
# Raincloud plots of the index maxima/minima per habitat group,
# excluding the extreme EVI_min outliers (< -0.5). The same vector
# serves as variable names and axis labels.
local({
  vars <- c("NDVI_max", "EVI_max", "SAVI_max", "NDMI_max", "NDWI_max",
            "NDVI_min", "EVI_min", "SAVI_min", "NDMI_min", "NDWI_min")
  distr_plot(data_validation %>% dplyr::filter(EVI_min > -0.5), vars, vars)
})
# Canopy height distribution per habitat group
distr_plot(data_validation, "canopy_height", "Canopy height (m)")
# Stacked bar chart of canopy-height classes within each EUNIS level-1
# group. Heights are binned into six ordered classes; NA heights get an
# NA class and are shown by ggplot as a grey segment.
ggplot(data_validation %>%
         mutate(CH_cat =
                  factor(
                    case_when(canopy_height == 0 ~ "0 m",
                              canopy_height > 0 & canopy_height <= 1 ~ "0-1 m",
                              canopy_height > 1 & canopy_height <= 2 ~ "1-2 m",
                              canopy_height > 2 & canopy_height <= 5 ~ "2-5 m",
                              canopy_height > 5 & canopy_height <= 8 ~ "5-8 m",
                              canopy_height > 8 ~ "> 8 m",
                              is.na(canopy_height) ~ NA_character_),
                    levels = c(
                      "0 m", "0-1 m", "1-2 m", "2-5 m", "5-8 m", "> 8 m"))),
       aes(x = EUNISa_1_descr, fill = CH_cat)) +
  geom_bar() + theme_bw() + coord_flip() +
  scale_y_continuous(labels = label_number()) +
  scale_fill_viridis_d(direction = -1) +
  labs(x = "EUNIS level 1", fill = "Canopy height") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  # legend.position = c(x, y) is deprecated since ggplot2 3.5.0;
  # use "inside" plus legend.position.inside for in-panel legends
  theme(legend.position = "inside",
        legend.position.inside = c(0.8, 0.75),
        legend.direction = "vertical")
# Canopy-height summary statistics per EUNIS level-1 group.
# Passing na.rm = TRUE through across()'s `...` is deprecated since
# dplyr 1.1 (1.1.4 is attached above) and may stop working; wrap each
# summary function so the NA handling is explicit instead.
data_validation %>%
  group_by(EUNISa_1_descr) %>%
  summarise(across(canopy_height, list(
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE),
    min = ~ min(.x, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE)
  )))
Only using NDVI- and SAVI-based values so far.
Maximum NDVI should be equal to value at peak?
nrow(data_validation %>% dplyr::filter(NDVI_pos_value != NDVI_max))
[1] 584
Not sure why this happens, but in most cases the absolute difference is small (< 0.1). Anyway, we should use only one of these two in the RF models.
# Histograms of every phenology metric (DOY and index value at SOS/POS/
# EOS for the slope- and threshold-based definitions, growing season
# duration, POS-SOS/POS-EOS differences, and AUC). The variable name
# doubles as the axis label; the order matches the original sequence of
# individual calls. local() keeps the helpers out of the global env.
local({
  hist_vars <- c(
    # Day of year at SOS/POS/EOS, slope- then threshold-based
    "NDVI_sos_slope_doy", "NDVI_pos_doy", "NDVI_eos_slope_doy",
    "EVI_sos_slope_doy", "EVI_pos_doy", "EVI_eos_slope_doy",
    "SAVI_sos_slope_doy", "SAVI_pos_doy", "SAVI_eos_slope_doy",
    "NDVI_sos_threshold_doy", "NDVI_pos_doy", "NDVI_eos_threshold_doy",
    "EVI_sos_threshold_doy", "EVI_pos_doy", "EVI_eos_threshold_doy",
    "SAVI_sos_threshold_doy", "SAVI_pos_doy", "SAVI_eos_threshold_doy",
    # Index values at SOS/POS/EOS, slope- then threshold-based
    "NDVI_sos_slope_value", "NDVI_pos_value", "NDVI_eos_slope_value",
    "EVI_sos_slope_value", "EVI_pos_value", "EVI_eos_slope_value",
    "NDVI_sos_threshold_value", "NDVI_pos_value", "NDVI_eos_threshold_value",
    "EVI_sos_threshold_value", "EVI_pos_value", "EVI_eos_threshold_value",
    # Growing season duration and POS-SOS/POS-EOS differences (slope)
    "NDVI_slope_gsd", "EVI_slope_gsd", "SAVI_slope_gsd",
    "NDVI_diff_pos_sos_slope_value", "EVI_diff_pos_sos_slope_value",
    "SAVI_diff_pos_sos_slope_value",
    "NDVI_diff_pos_eos_slope_value", "EVI_diff_pos_eos_slope_value",
    "SAVI_diff_pos_eos_slope_value",
    "NDVI_diff_pos_sos_slope_doy", "EVI_diff_pos_sos_slope_doy",
    "SAVI_diff_pos_sos_slope_doy",
    "NDVI_diff_pos_eos_slope_doy", "EVI_diff_pos_eos_slope_doy",
    "SAVI_diff_pos_eos_slope_doy",
    # Growing season duration and differences (threshold)
    "NDVI_threshold_gsd", "EVI_threshold_gsd", "SAVI_threshold_gsd",
    "NDVI_diff_pos_sos_threshold_value", "EVI_diff_pos_sos_threshold_value",
    "SAVI_diff_pos_sos_threshold_value",
    "NDVI_diff_pos_eos_threshold_value", "EVI_diff_pos_eos_threshold_value",
    "SAVI_diff_pos_eos_threshold_value",
    "NDVI_diff_pos_sos_threshold_doy", "EVI_diff_pos_sos_threshold_doy",
    "SAVI_diff_pos_sos_threshold_doy",
    "NDVI_diff_pos_eos_threshold_doy", "EVI_diff_pos_eos_threshold_doy",
    "SAVI_diff_pos_eos_threshold_doy",
    # Areas under the curve
    "NDVI_auc_slope", "EVI_auc_slope", "SAVI_auc_slope",
    "NDVI_auc_threshold", "EVI_auc_threshold", "SAVI_auc_threshold"
  )
  for (v in hist_vars) {
    print(plot_histogram(data_validation, v, v))
  }
})
# Raincloud plots of the phenology values and DOYs per habitat group.
# The original passed the same 36-name vector twice (variables and
# labels); define it once inside local() and reuse it.
local({
  vars <- c(
    "NDVI_sos_slope_value", "NDVI_pos_value", "NDVI_eos_slope_value",
    "EVI_sos_slope_value", "EVI_pos_value", "EVI_eos_slope_value",
    "SAVI_sos_slope_value", "SAVI_pos_value", "SAVI_eos_slope_value",
    "NDVI_sos_slope_doy", "NDVI_pos_doy", "NDVI_eos_slope_doy",
    "EVI_sos_slope_doy", "EVI_pos_doy", "EVI_eos_slope_doy",
    "SAVI_sos_slope_doy", "SAVI_pos_doy", "SAVI_eos_slope_doy",
    "NDVI_sos_threshold_value", "NDVI_pos_value", "NDVI_eos_threshold_value",
    "EVI_sos_threshold_value", "EVI_pos_value", "EVI_eos_threshold_value",
    "SAVI_sos_threshold_value", "SAVI_pos_value", "SAVI_eos_threshold_value",
    "NDVI_sos_threshold_doy", "NDVI_pos_doy", "NDVI_eos_threshold_doy",
    "EVI_sos_threshold_doy", "EVI_pos_doy", "EVI_eos_threshold_doy",
    "SAVI_sos_threshold_doy", "SAVI_pos_doy", "SAVI_eos_threshold_doy"
  )
  distr_plot(data_validation, vars, vars)
})
# Raincloud plots of growing season duration and the POS-SOS / POS-EOS
# differences (value and DOY) per habitat group. Single vector reused
# for variables and labels.
local({
  vars <- c(
    "NDVI_slope_gsd", "EVI_slope_gsd", "SAVI_slope_gsd",
    "NDVI_diff_pos_sos_slope_value", "EVI_diff_pos_sos_slope_value",
    "SAVI_diff_pos_sos_slope_value",
    "NDVI_diff_pos_eos_slope_value", "EVI_diff_pos_eos_slope_value",
    "SAVI_diff_pos_eos_slope_value",
    "NDVI_diff_pos_sos_slope_doy", "EVI_diff_pos_sos_slope_doy",
    "SAVI_diff_pos_sos_slope_doy",
    "NDVI_diff_pos_eos_slope_doy", "EVI_diff_pos_eos_slope_doy",
    "SAVI_diff_pos_eos_slope_doy",
    "NDVI_threshold_gsd", "EVI_threshold_gsd", "SAVI_threshold_gsd",
    "NDVI_diff_pos_sos_threshold_value", "EVI_diff_pos_sos_threshold_value",
    "SAVI_diff_pos_sos_threshold_value",
    "NDVI_diff_pos_eos_threshold_value", "EVI_diff_pos_eos_threshold_value",
    "SAVI_diff_pos_eos_threshold_value",
    "NDVI_diff_pos_sos_threshold_doy", "EVI_diff_pos_sos_threshold_doy",
    "SAVI_diff_pos_sos_threshold_doy",
    "NDVI_diff_pos_eos_threshold_doy", "EVI_diff_pos_eos_threshold_doy",
    "SAVI_diff_pos_eos_threshold_doy"
  )
  distr_plot(data_validation, vars, vars)
})
# Raincloud plots of the AUC metrics (slope- and threshold-based season
# plus fixed March-October window) per habitat group.
local({
  vars <- c("NDVI_auc_slope", "EVI_auc_slope", "SAVI_auc_slope",
            "NDVI_auc_threshold", "EVI_auc_threshold", "SAVI_auc_threshold",
            "NDVI_auc_mar_oct", "EVI_auc_mar_oct", "SAVI_auc_mar_oct")
  distr_plot(data_validation, vars, vars)
})
# Raincloud plot (flat violin + jittered points + boxplot) of each y
# variable by EUNIS level-1 group, faceted by biogeographical region.
# Unlike distr_plot(), this returns a named list of ggplot objects
# (one per y variable) instead of printing them.
#
# data:     data frame with EUNISa_1, EUNISa_1_descr, biogeo and y_vars
# y_vars:   character vector of column names to plot
# y_labels: axis labels, same length as y_vars
distr_plot_biogeo <- function(data, y_vars, y_labels) {
  plots <- list()
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    p <- ggplot(data = data %>%
                  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = .data[[y_var]],
                    fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0),
                       alpha = 0.8) +
      geom_point(aes(color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15),
                 size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      # fun.y and ..label.. are deprecated in ggplot2 >= 3.4; use fun=
      # and after_stat() instead
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = after_stat(label)),
                   vjust = 0.5) +
      # Human-readable axis title, consistent with distr_plot()
      # (previously the raw column name "EUNISa_1_descr" was shown)
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip() + facet_wrap(~ biogeo)
    plots[[y_var]] <- p
  }
  return(plots)
}
Distribution plots:
distr_plot_biogeo(data_validation, "canopy_height", "Canopy height (m)")
$canopy_height
RF models fitted using the conditional inference version of random forest (first cforest in package party, now fastcforest in package moreparty). Suggested if the data are highly correlated. Cforest is more stable in deriving variable importance values in the presence of highly correlated variables, thus providing better accuracy in calculating variable importance (ref below).
Hothorn, T., Hornik, K. and Zeileis, A. (2006) Unbiased Recursive Partitioning: A Conditional Inference Framework. Journal of Computational and Graphical Statistics, 15, 651-674. http://dx.doi.org/10.1198/106186006X133933
Choose the hyperparameter mtry based on the square root of the number of predictor variables:
Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements of statistical learning: Data mining, inference, and prediction. Springer Science & Business Media.
Maybe TO_DO: We varied ntree from 50 to 800 in steps of 50, leaving mtry constant at 2. This parameter variation showed that ntree=500 was optimal, while higher ntree led to no further model improvement (Supplementary Fig. S10). Subsequently, the hyperparameter mtry was varied from 2 to 8 with constant ntree=500. Here, mtry=3 led to the best results in almost all cases (Supplementary Fig. S11). Consequently, we chose ntree=500 and mtry=3 for our main analysis across all study sites.
Define a function to run fastcforest models:
# Fit a conditional-inference random forest (moreparty::fastcforest) in
# parallel and time the fit.
#
# vars_RF:      character vector of predictor column names
# train_data:   training data frame
# response_var: character scalar, name of the response column
# ntree:        number of trees (default 500)
# Returns list(model = fitted forest, time = system.time of the fit).
run_rf <- function(vars_RF, train_data, response_var, ntree = 500)
{
  # Detect and register available cores (leave one free, but never
  # fewer than one, so single-core machines still work)
  n_cores <- max(1L, parallel::detectCores() - 1L)
  cl <- makeCluster(n_cores)
  # Guarantee the cluster is shut down even if the fit errors out
  # (previously a failed fit leaked the worker processes)
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  # Export the variables the workers need
  clusterExport(cl, varlist = c("vars_RF", "train_data", "response_var"),
                envir = environment())
  # Set seed for reproducibility
  set.seed(123)
  # Measure execution time of the fit itself
  execution_time <- system.time({
    rf_model <- fastcforest(
      formula = reformulate(vars_RF, response = response_var),
      data = train_data,
      controls = party::cforest_control(
        # sqrt(p) rule of thumb for mtry (Hastie et al. 2009)
        mtry = round(sqrt(length(vars_RF))),
        ntree = ntree
      ),
      parallel = TRUE
    )
  })
  # Return both the model and execution time
  list(model = rf_model, time = execution_time)
}
# compute_varimp <- function(model, nperm = 100,
# n_cores = parallel::detectCores() - 1) {
# # Set up parallel backend
# cl <- makeCluster(n_cores)
# registerDoParallel(cl)
#
# # Measure execution time
# execution_time <- system.time({
# varimp_list <- foreach(i = 1:nperm, .combine = '+',
# .packages = "party") %dopar% {
# varimp(model, conditional = FALSE, nperm = 1)
# }
# })
#
# stopCluster(cl)
#
# # Average the results
# varimp_avg <- varimp_list / nperm
#
# return(list(varimp = varimp_avg, time = execution_time))
# }
Using permimp() from the permimp package: https://cran.r-project.org/web/packages/permimp/vignettes/permimp-package.html#fn1
# Unconditional permutation variable importance via permimp::permimp().
#
# model: fitted cforest object
# nperm: number of permutations. Bug fix: this argument was previously
#        accepted but never forwarded, so permimp() silently ran with
#        its own default; it is now passed through.
# Returns list(varimp = permimp result, time = system.time).
compute_varimp <- function(model, nperm = 100) {
  # Measure execution time
  execution_time <- system.time({
    varimp_result <- permimp(model, nperm = nperm, conditional = FALSE,
                             progressBar = TRUE)
  })
  return(list(varimp = varimp_result, time = execution_time))
}
# Conditional permutation variable importance via permimp::permimp()
# (recommended when predictors are highly correlated).
#
# model: fitted cforest object
# nperm: number of permutations. Bug fix: previously accepted but never
#        forwarded to permimp(); now passed through.
# Returns list(varimp = permimp result, time = system.time).
compute_varimp_cond <- function(model, nperm = 100) {
  # Measure execution time
  execution_time <- system.time({
    varimp_result <- permimp(model, nperm = nperm, conditional = TRUE,
                             progressBar = TRUE)
  })
  return(list(varimp = varimp_result, time = execution_time))
}
# One-vs-rest ROC curves for the four EUNIS level-1 classes.
#
# model:     fitted cforest-type model supporting predict(type = "prob")
# test_data: data frame with the predictors and EUNISa_1
# Returns list(roc = long data frame with FPR, TPR and a Class label
#              that embeds the AUC, time = system.time).
compute_roc_level1 <- function(model, test_data) {
  # Measure execution time
  execution_time <- system.time({
    # Step 1: Predict class probabilities (list of one-row matrices)
    probabilities <- predict(model, newdata = test_data, type = "prob")
    # Step 2: Convert the list of matrices to a proper data frame
    prob_matrix <- t(sapply(probabilities, as.vector))
    prob_df <- as.data.frame(prob_matrix)
    # NOTE(review): assumes the probability columns come back in
    # alphabetical order of the response levels -- confirm against the
    # colnames of the prediction matrices
    colnames(prob_df) <- c("Q", "R", "S", "T")
    # Step 3: Prepare actual class labels
    actual <- factor(test_data$EUNISa_1, levels = c("Q", "R", "S", "T"))
    # Step 4: One-hot encode the actual labels
    actual_bin <- model.matrix(~ actual - 1)
    colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
    # Step 5: Compute ROC data for each class (one-vs-rest)
    roc_data <- lapply(levels(actual), function(class) {
      roc_obj <- roc(actual_bin[, class], prob_df[[class]])
      auc_val <- round(auc(roc_obj), 3)
      data.frame(
        # Bug fix: FPR = 1 - specificity. The original stored the
        # specificities themselves, mirroring the curve horizontally.
        FPR = rev(1 - roc_obj$specificities),
        TPR = rev(roc_obj$sensitivities),
        Class = paste0(class, " (AUC = ", auc_val, ")")
      )
    }) %>% bind_rows()
  })
  # Return both ROC data and execution time
  return(list(roc = roc_data, time = execution_time))
}
# One-vs-rest ROC curves for the fourteen EUNIS level-2 classes.
# Same structure as compute_roc_level1(), with the level-2 class set.
#
# model:     fitted cforest-type model supporting predict(type = "prob")
# test_data: data frame with the predictors and EUNISa_2
# Returns list(roc = long data frame with FPR, TPR and a Class label
#              that embeds the AUC, time = system.time).
compute_roc_level2 <- function(model, test_data) {
  # Measure execution time
  execution_time <- system.time({
    # Step 1: Predict class probabilities (list of one-row matrices)
    probabilities <- predict(model, newdata = test_data, type = "prob")
    # Step 2: Convert the list of matrices to a proper data frame
    prob_matrix <- t(sapply(probabilities, as.vector))
    prob_df <- as.data.frame(prob_matrix)
    # NOTE(review): assumes alphabetical ordering of the probability
    # columns -- confirm against the prediction matrices' colnames
    colnames(prob_df) <- c("Q1", "Q2", "Q4", "Q5", "R1", "R2", "R3", "R4",
                           "R5", "R6", "S3", "S4", "T1", "T3")
    # Step 3: Prepare actual class labels
    actual <- factor(test_data$EUNISa_2,
                     levels = c("Q1", "Q2", "Q4", "Q5", "R1", "R2", "R3", "R4",
                                "R5", "R6", "S3", "S4", "T1", "T3"))
    # Step 4: One-hot encode the actual labels
    actual_bin <- model.matrix(~ actual - 1)
    colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
    # Step 5: Compute ROC data for each class (one-vs-rest)
    roc_data <- lapply(levels(actual), function(class) {
      roc_obj <- roc(actual_bin[, class], prob_df[[class]])
      auc_val <- round(auc(roc_obj), 3)
      data.frame(
        # Bug fix: FPR = 1 - specificity (original stored specificity)
        FPR = rev(1 - roc_obj$specificities),
        TPR = rev(roc_obj$sensitivities),
        Class = paste0(class, " (AUC = ", auc_val, ")")
      )
    }) %>% bind_rows()
  })
  # Return both ROC data and execution time
  return(list(roc = roc_data, time = execution_time))
}
# Predictor names for the slope-based phenology RF model. Built
# programmatically from the index names; evaluates to exactly the same
# 35-element character vector as before.
vars_RF_slope <- local({
  idx3 <- c("NDVI", "EVI", "SAVI")
  c(
    # Annual minima of all five indices
    paste0(c("NDVI", "EVI", "NDMI", "NDWI", "SAVI"), "_min"),
    # Annual maxima of the moisture/water indices
    "NDMI_max", "NDWI_max",
    # Area under the curve (slope-based season)
    paste0(idx3, "_auc_slope"),
    # Index values at start, peak and end of season (per index)
    as.vector(vapply(idx3, function(i)
      paste0(i, c("_sos_slope_value", "_pos_value", "_eos_slope_value")),
      character(3))),
    # Peak-minus-start differences in value and day of year
    paste0(idx3, "_diff_pos_sos_slope_value"),
    paste0(idx3, "_diff_pos_sos_slope_doy"),
    # Peak-minus-end differences in value and day of year
    paste0(idx3, "_diff_pos_eos_slope_value"),
    paste0(idx3, "_diff_pos_eos_slope_doy"),
    # Growing season duration
    paste0(idx3, "_slope_gsd"),
    # Canopy height
    "canopy_height"
  )
})
# Predictor names for the threshold-based phenology RF model. Built
# programmatically; evaluates to the same 35-element character vector
# as the original literal.
vars_RF_threshold <- local({
  idx3 <- c("NDVI", "EVI", "SAVI")
  c(
    # Annual minima of all five indices
    paste0(c("NDVI", "EVI", "NDMI", "NDWI", "SAVI"), "_min"),
    # Annual maxima of the moisture/water indices
    "NDMI_max", "NDWI_max",
    # Area under the curve (threshold-based season)
    paste0(idx3, "_auc_threshold"),
    # Index values at start, peak and end of season (per index)
    as.vector(vapply(idx3, function(i)
      paste0(i,
             c("_sos_threshold_value", "_pos_value", "_eos_threshold_value")),
      character(3))),
    # Peak-minus-start differences in value and day of year
    paste0(idx3, "_diff_pos_sos_threshold_value"),
    paste0(idx3, "_diff_pos_sos_threshold_doy"),
    # Peak-minus-end differences in value and day of year
    paste0(idx3, "_diff_pos_eos_threshold_value"),
    paste0(idx3, "_diff_pos_eos_threshold_doy"),
    # Growing season duration
    paste0(idx3, "_threshold_gsd"),
    # Canopy height
    "canopy_height"
  )
})
# Predictor names for the fixed-months (March-October) RF model. Built
# programmatically; evaluates to the same 32-element character vector
# as the original literal (no growing-season-duration variables here).
vars_RF_months <- local({
  idx3 <- c("NDVI", "EVI", "SAVI")
  c(
    # Annual minima of all five indices
    paste0(c("NDVI", "EVI", "NDMI", "NDWI", "SAVI"), "_min"),
    # Annual maxima of the moisture/water indices
    "NDMI_max", "NDWI_max",
    # Area under the curve between March and October
    paste0(idx3, "_auc_mar_oct"),
    # Index values in March, at peak of season, and in October
    as.vector(vapply(idx3, function(i)
      paste0(i, c("_avg_value_03", "_pos_value", "_avg_value_10")),
      character(3))),
    # Peak-minus-March differences in value and day of year
    paste0(idx3, "_diff_pos_march_value"),
    paste0(idx3, "_diff_pos_march_doy"),
    # Peak-minus-October differences in value and day of year
    paste0(idx3, "_diff_pos_oct_value"),
    paste0(idx3, "_diff_pos_oct_doy"),
    # Canopy height
    "canopy_height"
  )
})
Define a set of rules for a first validation of ALL ReSurvey data (“Expert-based” rules). Not very ambitious, only taking out observations that are clearly wrong on the basis of indicator values.
Create column for first validation based on different indicators, where “wrong” is noted when the validation rule is not met.
Define rules:
# First "expert-based" validation pass: flag observations whose RS
# indicators are clearly incompatible with the recorded habitat.
# Each valid_1_* column is "wrong" when its rule is broken, NA when it
# is met; the order of conditions in each case_when matters.
data_validation <-
  data_validation %>%
  mutate(
    valid_1_NDWI = case_when(
      # Points that are basically water (very high NDWI maximum)
      NDWI_max > 0.3 ~ "wrong",
      TRUE ~ NA_character_),
    valid_1_CH = case_when(
      # R & Q points with high canopy height
      EUNISa_1 %in% c("R", "Q") & canopy_height > 2 ~ "wrong",
      # T points with low canopy height
      EUNISa_1 == "T" & canopy_height < 3 ~ "wrong",
      # S points with high canopy height
      EUNISa_1 == "S" & canopy_height > 3 ~ "wrong",
      TRUE ~ NA_character_),
    # No rules for NDVI, we will take out observations based on distributions
    # Count how many validation rules are not met (NA counts as met)
    valid_1_count = rowSums(across(c(valid_1_NDWI, valid_1_CH), ~ . == "wrong"),
                            na.rm = TRUE),
    # Points where at least 1 rule not met
    valid_1 = if_else(valid_1_count > 0, "At least 1 rule broken",
                      "No rules broken so far")
  )
# "Slope" predictor table: rows with physically valid index ranges,
# complete predictors, and strictly positive phenology differences.
filtered_data0_slope <- data_validation %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Drop rows whose index extremes fall outside the valid [-1, 1] range
  dplyr::filter(
    EVI_max <= 1, EVI_min >= -1,
    NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1
  ) %>%
  # Drop rows with a missing value in any predictor
  dplyr::filter(!if_any(all_of(vars_RF_slope), is.na)) %>%
  # Keep only the columns needed downstream
  select(PlotObservationID, EUNISa_1, all_of(vars_RF_slope), Lctnmth,
         valid_1) %>%
  # Keep only rows where every "diff" variable is strictly positive
  dplyr::filter(if_all(contains("diff"), ~ .x > 0))
# "Threshold" predictor table: same cleaning as the slope table, but
# keeping complete cases for the threshold-method predictor set.
filtered_data0_threshold <- data_validation %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Drop rows whose index extremes fall outside the valid [-1, 1] range
  dplyr::filter(
    EVI_max <= 1, EVI_min >= -1,
    NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1
  ) %>%
  # Drop rows with a missing value in any predictor
  dplyr::filter(!if_any(all_of(vars_RF_threshold), is.na)) %>%
  # Keep only the columns needed downstream
  select(PlotObservationID, EUNISa_1, all_of(vars_RF_threshold), Lctnmth,
         valid_1) %>%
  # Keep only rows where every "diff" variable is strictly positive
  dplyr::filter(if_all(contains("diff"), ~ .x > 0))
# "Months" predictor table: same cleaning as above, but keeping complete
# cases for the months-method predictor set (vars_RF_months).
filtered_data0_months <- data_validation %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  # Drop rows whose index extremes fall outside the valid [-1, 1] range
  dplyr::filter(
    EVI_max <= 1, EVI_min >= -1,
    NDVI_max <= 1, NDMI_max <= 1, NDWI_min >= -1
  ) %>%
  # Drop rows with a missing value in any predictor
  dplyr::filter(!if_any(all_of(vars_RF_months), is.na)) %>%
  # Keep only the columns needed downstream
  select(PlotObservationID, EUNISa_1, all_of(vars_RF_months), Lctnmth,
         valid_1) %>%
  # Keep only rows where every "diff" variable is strictly positive
  dplyr::filter(if_all(contains("diff"), ~ .x > 0))
Correlation of all variables to be included in RF models:
# Visualise pairwise correlations among the "slope" predictors.
# NOTE(review): corrplot() comes from the corrplot package, which is not
# attached in the setup chunk shown above — confirm it is loaded earlier.
corrplot(filtered_data0_slope %>%
dplyr::select(all_of(vars_RF_slope)) %>%
cor(use = "pairwise.complete.obs"),
method = "color", type = "upper", tl.col = "black", tl.srt = 45)
# Compute correlation matrix
cor_matrix_slope <- filtered_data0_slope %>%
dplyr::select(all_of(vars_RF_slope)) %>%
cor(use = "pairwise.complete.obs")
# Set the diagonal to 0 to ignore self-correlation
diag(cor_matrix_slope) <- 0
# Find variables where all absolute correlations are < 0.7
# NOTE(review): this keeps only variables uncorrelated with *every* other
# variable, so BOTH members of a correlated pair are dropped; if the intent
# is to drop just one per pair, something like caret::findCorrelation()
# would be needed instead — confirm which behaviour is wanted.
keep_vars <- apply(abs(cor_matrix_slope), 1, function(x) all(x < 0.7))
# Subset the original data frame to keep only those variables
# (the ID, habitat and metadata columns are intentionally not carried over)
filtered_data0_slope_nocorr <- (filtered_data0_slope %>%
dplyr::select(all_of(vars_RF_slope)))[, keep_vars]
# 1: Slope method, no previous validation, GPS points
# (plain GPS plus differential GPS locations)
filtered_data1_slope <- filtered_data0_slope %>%
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS")) %>%
  select(-Lctnmth, -valid_1)
# 2: Slope method, no previous validation, GPS diff points
filtered_data2_slope <- filtered_data0_slope %>%
  dplyr::filter(Lctnmth == "Location with differential GPS") %>%
  select(-Lctnmth, -valid_1)
# 3: Slope method, rough validation, GPS points
# (additionally requires that no expert-based rule was broken)
filtered_data3_slope <- filtered_data0_slope %>%
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS"),
                valid_1 == "No rules broken so far") %>%
  select(-Lctnmth, -valid_1)
# 4: Slope method, rough validation, GPS diff points
filtered_data4_slope <- filtered_data0_slope %>%
  dplyr::filter(Lctnmth == "Location with differential GPS",
                valid_1 == "No rules broken so far") %>%
  select(-Lctnmth, -valid_1)
# 5: Slope method, rough validation + refinement 10-90th, GPS points
# 6: Slope method, rough validation + refinement 10-90th, GPS diff points
# 7: Slope method, rough validation + refinement 20-80th, GPS points
# 8: Slope method, rough validation + refinement 20-80th, GPS diff points
# 9: Threshold method, no previous validation, GPS points
# (plain GPS plus differential GPS locations)
filtered_data9_threshold <- filtered_data0_threshold %>%
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS")) %>%
  select(-Lctnmth, -valid_1)
# 10: Threshold method, no previous validation, GPS diff points
filtered_data10_threshold <- filtered_data0_threshold %>%
  dplyr::filter(Lctnmth == "Location with differential GPS") %>%
  select(-Lctnmth, -valid_1)
# 11: Threshold method, rough validation, GPS points
# (additionally requires that no expert-based rule was broken)
filtered_data11_threshold <- filtered_data0_threshold %>%
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS"),
                valid_1 == "No rules broken so far") %>%
  select(-Lctnmth, -valid_1)
# 12: Threshold method, rough validation, GPS diff points
filtered_data12_threshold <- filtered_data0_threshold %>%
  dplyr::filter(Lctnmth == "Location with differential GPS",
                valid_1 == "No rules broken so far") %>%
  select(-Lctnmth, -valid_1)
# 13: Threshold method, rough validation + refinement 10-90th, GPS points
# 14: Threshold method, rough validation + refinement 10-90th, GPS diff points
# 15: Threshold method, rough validation + refinement 20-80th, GPS points
# 16: Threshold method, rough validation + refinement 20-80th, GPS diff points
# 17: Months method, no previous validation, GPS points
# (plain GPS plus differential GPS locations)
filtered_data17_months <- filtered_data0_months %>%
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS")) %>%
  select(-Lctnmth, -valid_1)
# 18: Months method, no previous validation, GPS diff points
filtered_data18_months <- filtered_data0_months %>%
  dplyr::filter(Lctnmth == "Location with differential GPS") %>%
  select(-Lctnmth, -valid_1)
# 19: Months method, rough validation, GPS points
# (additionally requires that no expert-based rule was broken)
filtered_data19_months <- filtered_data0_months %>%
  dplyr::filter(Lctnmth %in% c("Location with GPS",
                               "Location with differential GPS"),
                valid_1 == "No rules broken so far") %>%
  select(-Lctnmth, -valid_1)
# 20: Months method, rough validation, GPS diff points
filtered_data20_months <- filtered_data0_months %>%
  dplyr::filter(Lctnmth == "Location with differential GPS",
                valid_1 == "No rules broken so far") %>%
  select(-Lctnmth, -valid_1)
# 21: Months method, rough validation + refinement 10-90th, GPS points
# 22: Months method, rough validation + refinement 10-90th, GPS diff points
# 23: Months method, rough validation + refinement 20-80th, GPS points
# 24: Months method, rough validation + refinement 20-80th, GPS diff points
# Per-dataset sample sizes by EUNIS level-1 class (Q, R, S, T).
#
# Helper: count plots per class and return a one-row labelled summary.
# Passing the label alongside the data frame fixes the copy-paste bug in
# the original, where datasets 10-12 were all labelled
# "filtered_data9_threshold".
count_by_class <- function(df, label) {
  df %>%
    count(EUNISa_1) %>%
    pivot_wider(names_from = EUNISa_1, values_from = n) %>%
    mutate(data = label) %>%
    select(data, Q, R, S, T)
}
bind_rows(
  count_by_class(filtered_data1_slope, "filtered_data1_slope"),
  count_by_class(filtered_data2_slope, "filtered_data2_slope"),
  count_by_class(filtered_data3_slope, "filtered_data3_slope"),
  count_by_class(filtered_data4_slope, "filtered_data4_slope"),
  # count_by_class(filtered_data5_slope, "filtered_data5_slope"),
  # count_by_class(filtered_data6_slope, "filtered_data6_slope"),
  # count_by_class(filtered_data7_slope, "filtered_data7_slope"),
  # count_by_class(filtered_data8_slope, "filtered_data8_slope"),
  count_by_class(filtered_data9_threshold, "filtered_data9_threshold"),
  count_by_class(filtered_data10_threshold, "filtered_data10_threshold"),
  count_by_class(filtered_data11_threshold, "filtered_data11_threshold"),
  count_by_class(filtered_data12_threshold, "filtered_data12_threshold"),
  # count_by_class(filtered_data13_threshold, "filtered_data13_threshold"),
  # count_by_class(filtered_data14_threshold, "filtered_data14_threshold"),
  # count_by_class(filtered_data15_threshold, "filtered_data15_threshold"),
  # count_by_class(filtered_data16_threshold, "filtered_data16_threshold"),
  count_by_class(filtered_data17_months, "filtered_data17_months"),
  count_by_class(filtered_data18_months, "filtered_data18_months"),
  count_by_class(filtered_data19_months, "filtered_data19_months"),
  count_by_class(filtered_data20_months, "filtered_data20_months")
  # count_by_class(filtered_data21_months, "filtered_data21_months"),
  # count_by_class(filtered_data22_months, "filtered_data22_months"),
  # count_by_class(filtered_data23_months, "filtered_data23_months"),
  # count_by_class(filtered_data24_months, "filtered_data24_months"),
)
# Reproducible 70/30 train/test split: draw training-row positions for
# every candidate dataset. seq_len() is safe on empty data (unlike 1:n,
# which yields c(1, 0) when n == 0) and floor() makes the size truncation
# performed by sample() explicit; for non-empty data the draws are
# identical to the original under the same seed.
set.seed(123)
train_indices1 <- sample(seq_len(nrow(filtered_data1_slope)),
                         floor(0.7 * nrow(filtered_data1_slope)))
train_indices2 <- sample(seq_len(nrow(filtered_data2_slope)),
                         floor(0.7 * nrow(filtered_data2_slope)))
train_indices3 <- sample(seq_len(nrow(filtered_data3_slope)),
                         floor(0.7 * nrow(filtered_data3_slope)))
train_indices4 <- sample(seq_len(nrow(filtered_data4_slope)),
                         floor(0.7 * nrow(filtered_data4_slope)))
# train_indices5 <- sample(seq_len(nrow(filtered_data5_slope)),
#                          floor(0.7 * nrow(filtered_data5_slope)))
# train_indices6 <- sample(seq_len(nrow(filtered_data6_slope)),
#                          floor(0.7 * nrow(filtered_data6_slope)))
# train_indices7 <- sample(seq_len(nrow(filtered_data7_slope)),
#                          floor(0.7 * nrow(filtered_data7_slope)))
# train_indices8 <- sample(seq_len(nrow(filtered_data8_slope)),
#                          floor(0.7 * nrow(filtered_data8_slope)))
train_indices9 <- sample(seq_len(nrow(filtered_data9_threshold)),
                         floor(0.7 * nrow(filtered_data9_threshold)))
train_indices10 <- sample(seq_len(nrow(filtered_data10_threshold)),
                          floor(0.7 * nrow(filtered_data10_threshold)))
train_indices11 <- sample(seq_len(nrow(filtered_data11_threshold)),
                          floor(0.7 * nrow(filtered_data11_threshold)))
train_indices12 <- sample(seq_len(nrow(filtered_data12_threshold)),
                          floor(0.7 * nrow(filtered_data12_threshold)))
# train_indices13 <- sample(seq_len(nrow(filtered_data13_threshold)),
#                           floor(0.7 * nrow(filtered_data13_threshold)))
# train_indices14 <- sample(seq_len(nrow(filtered_data14_threshold)),
#                           floor(0.7 * nrow(filtered_data14_threshold)))
# train_indices15 <- sample(seq_len(nrow(filtered_data15_threshold)),
#                           floor(0.7 * nrow(filtered_data15_threshold)))
# train_indices16 <- sample(seq_len(nrow(filtered_data16_threshold)),
#                           floor(0.7 * nrow(filtered_data16_threshold)))
train_indices17 <- sample(seq_len(nrow(filtered_data17_months)),
                          floor(0.7 * nrow(filtered_data17_months)))
train_indices18 <- sample(seq_len(nrow(filtered_data18_months)),
                          floor(0.7 * nrow(filtered_data18_months)))
train_indices19 <- sample(seq_len(nrow(filtered_data19_months)),
                          floor(0.7 * nrow(filtered_data19_months)))
train_indices20 <- sample(seq_len(nrow(filtered_data20_months)),
                          floor(0.7 * nrow(filtered_data20_months)))
# train_indices21 <- sample(seq_len(nrow(filtered_data21_months)),
#                           floor(0.7 * nrow(filtered_data21_months)))
# train_indices22 <- sample(seq_len(nrow(filtered_data22_months)),
#                           floor(0.7 * nrow(filtered_data22_months)))
# train_indices23 <- sample(seq_len(nrow(filtered_data23_months)),
#                           floor(0.7 * nrow(filtered_data23_months)))
# train_indices24 <- sample(seq_len(nrow(filtered_data24_months)),
#                           floor(0.7 * nrow(filtered_data24_months)))
# Training partitions (~70% of rows), selected by the matching
# train_indicesN vector drawn above.
train_data1 <- filtered_data1_slope[train_indices1, ]
train_data2 <- filtered_data2_slope[train_indices2, ]
train_data3 <- filtered_data3_slope[train_indices3, ]
train_data4 <- filtered_data4_slope[train_indices4, ]
# train_data5 <- filtered_data5_slope[train_indices5, ]
# train_data6 <- filtered_data6_slope[train_indices6, ]
# train_data7 <- filtered_data7_slope[train_indices7, ]
# train_data8 <- filtered_data8_slope[train_indices8, ]
train_data9 <- filtered_data9_threshold[train_indices9, ]
train_data10 <- filtered_data10_threshold[train_indices10, ]
train_data11 <- filtered_data11_threshold[train_indices11, ]
train_data12 <- filtered_data12_threshold[train_indices12, ]
# train_data13 <- filtered_data13_threshold[train_indices13, ]
# train_data14 <- filtered_data14_threshold[train_indices14, ]
# train_data15 <- filtered_data15_threshold[train_indices15, ]
# train_data16 <- filtered_data16_threshold[train_indices16, ]
train_data17 <- filtered_data17_months[train_indices17, ]
train_data18 <- filtered_data18_months[train_indices18, ]
train_data19 <- filtered_data19_months[train_indices19, ]
train_data20 <- filtered_data20_months[train_indices20, ]
# train_data21 <- filtered_data21_months[train_indices21, ]
# train_data22 <- filtered_data22_months[train_indices22, ]
# train_data23 <- filtered_data23_months[train_indices23, ]
# train_data24 <- filtered_data24_months[train_indices24, ]
# Test partitions: the complementary ~30% of rows (negative indexing
# removes the training rows from each dataset).
test_data1 <- filtered_data1_slope[-train_indices1, ]
test_data2 <- filtered_data2_slope[-train_indices2, ]
test_data3 <- filtered_data3_slope[-train_indices3, ]
test_data4 <- filtered_data4_slope[-train_indices4, ]
# test_data5 <- filtered_data5_slope[-train_indices5, ]
# test_data6 <- filtered_data6_slope[-train_indices6, ]
# test_data7 <- filtered_data7_slope[-train_indices7, ]
# test_data8 <- filtered_data8_slope[-train_indices8, ]
test_data9 <- filtered_data9_threshold[-train_indices9, ]
test_data10 <- filtered_data10_threshold[-train_indices10, ]
test_data11 <- filtered_data11_threshold[-train_indices11, ]
test_data12 <- filtered_data12_threshold[-train_indices12, ]
# test_data13 <- filtered_data13_threshold[-train_indices13, ]
# test_data14 <- filtered_data14_threshold[-train_indices14, ]
# test_data15 <- filtered_data15_threshold[-train_indices15, ]
# test_data16 <- filtered_data16_threshold[-train_indices16, ]
test_data17 <- filtered_data17_months[-train_indices17, ]
test_data18 <- filtered_data18_months[-train_indices18, ]
test_data19 <- filtered_data19_months[-train_indices19, ]
test_data20 <- filtered_data20_months[-train_indices20, ]
# test_data21 <- filtered_data21_months[-train_indices21, ]
# test_data22 <- filtered_data22_months[-train_indices22, ]
# test_data23 <- filtered_data23_months[-train_indices23, ]
# test_data24 <- filtered_data24_months[-train_indices24, ]
# Training time recorded on each fitted random forest (rfN$time);
# presumably a proc.time() measurement (user/system/elapsed, seconds) —
# TODO confirm where $time is set on the rf objects.
print(rf1$time)
user system elapsed
47.47 4.10 108.93
print(rf2$time)
user system elapsed
18.67 0.86 40.32
print(rf3$time)
user system elapsed
50.26 4.05 104.86
print(rf4$time)
user system elapsed
18.94 0.89 39.73
# print(rf5$time)
# print(rf6$time)
# print(rf7$time)
# print(rf8$time)
print(rf9$time)
user system elapsed
26.03 1.33 56.89
print(rf10$time)
user system elapsed
18.11 0.69 38.53
print(rf11$time)
user system elapsed
31.43 2.08 60.69
print(rf12$time)
user system elapsed
17.40 0.54 37.14
# print(rf13$time)
# print(rf14$time)
# print(rf15$time)
# print(rf16$time)
print(rf17$time)
user system elapsed
41.08 3.07 82.16
print(rf18$time)
user system elapsed
20.60 0.69 43.55
print(rf19$time)
user system elapsed
33.16 2.61 68.91
print(rf20$time)
user system elapsed
67.74 1.56 68.28
# print(rf21$time)
# print(rf22$time)
# print(rf23$time)
# print(rf24$time)
# 1: Slope method, no previous validation, GPS points
# Hold-out performance (output below): accuracy 0.705, kappa 0.53;
# T has the lowest sensitivity (0.48).
confusionMatrix(predictions_rf1, test_data1$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1067 344 184 13
R 522 2533 326 69
S 207 172 848 16
T 8 27 10 91
Overall Statistics
Accuracy : 0.7051
95% CI : (0.6938, 0.7163)
No Information Rate : 0.4779
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5336
Mcnemar's Test P-Value : < 2.2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5915 0.8235 0.6199 0.48148
Specificity 0.8832 0.7272 0.9221 0.99280
Pos Pred Value 0.6636 0.7342 0.6822 0.66912
Neg Pred Value 0.8474 0.8182 0.8999 0.98445
Prevalence 0.2803 0.4779 0.2125 0.02936
Detection Rate 0.1658 0.3935 0.1317 0.01414
Detection Prevalence 0.2498 0.5360 0.1931 0.02113
Balanced Accuracy 0.7373 0.7753 0.7710 0.73714
# 2: Slope method, no previous validation, GPS diff points
# Output below: accuracy 0.820 barely matches the no-information rate
# (0.821) and kappa is 0.04 — the model essentially always predicts R.
confusionMatrix(predictions_rf2, test_data2$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 2 0 1 1
R 70 942 66 63
S 1 0 0 0
T 0 5 1 2
Overall Statistics
Accuracy : 0.8198
95% CI : (0.7963, 0.8415)
No Information Rate : 0.8206
P-Value [Acc > NIR] : 0.549
Kappa : 0.041
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.027397 0.99472 0.0000000 0.030303
Specificity 0.998150 0.03865 0.9990792 0.994485
Pos Pred Value 0.500000 0.82559 0.0000000 0.250000
Neg Pred Value 0.938261 0.61538 0.9410234 0.944154
Prevalence 0.063258 0.82062 0.0589255 0.057192
Detection Rate 0.001733 0.81629 0.0000000 0.001733
Detection Prevalence 0.003466 0.98873 0.0008666 0.006932
Balanced Accuracy 0.512774 0.51668 0.4995396 0.512394
# 3: Slope method, rough validation, GPS points
# Output below: accuracy 0.730, kappa 0.57; T is predicted almost
# perfectly after the rough validation step.
confusionMatrix(predictions_rf3, test_data3$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 979 336 192 1
R 497 2441 249 2
S 184 133 758 0
T 0 0 0 126
Overall Statistics
Accuracy : 0.7297
95% CI : (0.7182, 0.741)
No Information Rate : 0.4934
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5667
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5898 0.8388 0.6322 0.97674
Specificity 0.8752 0.7497 0.9325 1.00000
Pos Pred Value 0.6492 0.7654 0.7051 1.00000
Neg Pred Value 0.8449 0.8269 0.9086 0.99948
Prevalence 0.2815 0.4934 0.2033 0.02187
Detection Rate 0.1660 0.4139 0.1285 0.02136
Detection Prevalence 0.2557 0.5407 0.1823 0.02136
Balanced Accuracy 0.7325 0.7942 0.7824 0.98837
# 4: Slope method, rough validation, GPS diff points
# Output below: accuracy 0.882 vs NIR 0.862, kappa 0.24; Q and S are
# never predicted (class imbalance dominates).
confusionMatrix(predictions_rf4, test_data4$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 0 0 0 0
R 67 876 50 2
S 0 0 0 0
T 0 0 1 20
Overall Statistics
Accuracy : 0.8819
95% CI : (0.8604, 0.9011)
No Information Rate : 0.8622
P-Value [Acc > NIR] : 0.0359
Kappa : 0.2388
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.00000 1.0000 0.0000 0.90909
Specificity 1.00000 0.1500 1.0000 0.99899
Pos Pred Value NaN 0.8804 NaN 0.95238
Neg Pred Value 0.93406 1.0000 0.9498 0.99799
Prevalence 0.06594 0.8622 0.0502 0.02165
Detection Rate 0.00000 0.8622 0.0000 0.01969
Detection Prevalence 0.00000 0.9793 0.0000 0.02067
Balanced Accuracy 0.50000 0.5750 0.5000 0.95404
# 5: Slope method, rough validation + refinement 10-90th, GPS points
# confusionMatrix(predictions_rf5, test_data5$EUNISa_1)
# 6: Slope method, rough validation + refinement 10-90th, GPS diff points
# confusionMatrix(predictions_rf6, test_data6$EUNISa_1)
# 7: Slope method, rough validation + refinement 20-80th, GPS points
# confusionMatrix(predictions_rf7, test_data7$EUNISa_1)
# 8: Slope method, rough validation + refinement 20-80th, GPS diff points
# confusionMatrix(predictions_rf8, test_data8$EUNISa_1)
# 9: Threshold method, no previous validation, GPS points
# Output below: accuracy 0.729, kappa 0.56.
confusionMatrix(predictions_rf9, test_data9$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 429 112 69 6
R 223 1161 123 21
S 84 67 322 2
T 4 7 6 31
Overall Statistics
Accuracy : 0.7285
95% CI : (0.7112, 0.7453)
No Information Rate : 0.5051
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.556
Mcnemar's Test P-Value : 6.417e-12
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5797 0.8619 0.6192 0.51667
Specificity 0.9030 0.7220 0.9287 0.99348
Pos Pred Value 0.6964 0.7598 0.6779 0.64583
Neg Pred Value 0.8484 0.8367 0.9097 0.98893
Prevalence 0.2775 0.5051 0.1950 0.02250
Detection Rate 0.1609 0.4353 0.1207 0.01162
Detection Prevalence 0.2310 0.5729 0.1781 0.01800
Balanced Accuracy 0.7413 0.7919 0.7740 0.75507
# 10: Threshold method, no previous validation, GPS diff points
# Output below: accuracy 0.812 equals the no-information rate and
# kappa is 0.04 — predictions are dominated by class R.
confusionMatrix(predictions_rf10, test_data10$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1 0 0 1
R 31 361 29 20
S 0 0 0 0
T 2 1 0 0
Overall Statistics
Accuracy : 0.8117
95% CI : (0.7722, 0.8469)
No Information Rate : 0.8117
P-Value [Acc > NIR] : 0.5291
Kappa : 0.0429
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.029412 0.99724 0.00000 0.000000
Specificity 0.997573 0.04762 1.00000 0.992941
Pos Pred Value 0.500000 0.81859 NaN 0.000000
Neg Pred Value 0.925676 0.80000 0.93498 0.952596
Prevalence 0.076233 0.81166 0.06502 0.047085
Detection Rate 0.002242 0.80942 0.00000 0.000000
Detection Prevalence 0.004484 0.98879 0.00000 0.006726
Balanced Accuracy 0.513492 0.52243 0.50000 0.496471
# 11: Threshold method, rough validation, GPS points
# Output below: accuracy 0.738, kappa 0.57.
confusionMatrix(predictions_rf11, test_data11$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 422 154 75 0
R 174 1060 98 1
S 72 71 298 0
T 0 0 0 38
Overall Statistics
Accuracy : 0.7381
95% CI : (0.7203, 0.7554)
No Information Rate : 0.5217
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5717
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.6317 0.8249 0.6327 0.97436
Specificity 0.8724 0.7683 0.9282 1.00000
Pos Pred Value 0.6482 0.7952 0.6757 1.00000
Neg Pred Value 0.8642 0.8009 0.9144 0.99959
Prevalence 0.2712 0.5217 0.1912 0.01583
Detection Rate 0.1713 0.4304 0.1210 0.01543
Detection Prevalence 0.2643 0.5412 0.1790 0.01543
Balanced Accuracy 0.7521 0.7966 0.7805 0.98718
# 12: Threshold method, rough validation, GPS diff points
# Output below: accuracy 0.885 vs NIR 0.862, kappa 0.27; only R and T
# are ever predicted.
confusionMatrix(predictions_rf12, test_data12$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 0 0 0 0
R 30 338 15 0
S 0 0 0 0
T 0 0 0 9
Overall Statistics
Accuracy : 0.8852
95% CI : (0.8494, 0.915)
No Information Rate : 0.8622
P-Value [Acc > NIR] : 0.1044
Kappa : 0.2689
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.00000 1.0000 0.00000 1.00000
Specificity 1.00000 0.1667 1.00000 1.00000
Pos Pred Value NaN 0.8825 NaN 1.00000
Neg Pred Value 0.92347 1.0000 0.96173 1.00000
Prevalence 0.07653 0.8622 0.03827 0.02296
Detection Rate 0.00000 0.8622 0.00000 0.02296
Detection Prevalence 0.00000 0.9770 0.00000 0.02296
Balanced Accuracy 0.50000 0.5833 0.50000 1.00000
# 13: Threshold method, rough validation + refinement 10-90th, GPS points
# confusionMatrix(predictions_rf13, test_data13$EUNISa_1)
# 14: Threshold method, rough validation + refinement 10-90th, GPS diff points
# confusionMatrix(predictions_rf14, test_data14$EUNISa_1)
# 15: Threshold method, rough validation + refinement 20-80th, GPS points
# confusionMatrix(predictions_rf15, test_data15$EUNISa_1)
# 16: Threshold method, rough validation + refinement 20-80th, GPS diff points
# confusionMatrix(predictions_rf16, test_data16$EUNISa_1)
# 17: Months method, no previous validation, GPS points
# Output below: accuracy 0.759, kappa 0.62 — better than the slope and
# threshold counterparts (models 1 and 9).
confusionMatrix(predictions_rf17, test_data17$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1011 254 116 8
R 319 2257 236 62
S 162 126 827 19
T 7 16 13 109
Overall Statistics
Accuracy : 0.7586
95% CI : (0.7471, 0.7698)
No Information Rate : 0.4787
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.6222
Mcnemar's Test P-Value : 1.689e-14
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.6744 0.8507 0.6938 0.55051
Specificity 0.9065 0.7864 0.9294 0.99326
Pos Pred Value 0.7279 0.7853 0.7293 0.75172
Neg Pred Value 0.8825 0.8516 0.9172 0.98351
Prevalence 0.2705 0.4787 0.2151 0.03573
Detection Rate 0.1824 0.4073 0.1492 0.01967
Detection Prevalence 0.2506 0.5186 0.2046 0.02616
Balanced Accuracy 0.7905 0.8186 0.8116 0.77188
# 18: Months method, no previous validation, GPS diff points
# Output below: accuracy 0.807 equals the no-information rate,
# kappa 0.10 — R dominates the predictions.
confusionMatrix(predictions_rf18, test_data18$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1 3 1 4
R 75 815 57 46
S 0 1 0 0
T 3 6 1 9
Overall Statistics
Accuracy : 0.8072
95% CI : (0.7817, 0.831)
No Information Rate : 0.8072
P-Value [Acc > NIR] : 0.519
Kappa : 0.0986
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.0126582 0.98788 0.0000000 0.152542
Specificity 0.9915164 0.09645 0.9989616 0.989616
Pos Pred Value 0.1111111 0.82075 0.0000000 0.473684
Neg Pred Value 0.9230010 0.65517 0.9422135 0.950150
Prevalence 0.0772994 0.80724 0.0577299 0.057730
Detection Rate 0.0009785 0.79746 0.0000000 0.008806
Detection Prevalence 0.0088063 0.97162 0.0009785 0.018591
Balanced Accuracy 0.5020873 0.54216 0.4994808 0.571079
# 19: Months method, rough validation, GPS points
# Output below: accuracy 0.807, kappa 0.70 — the best-balanced model of
# the set; its confusion matrix is plotted further below.
confusionMatrix(predictions_rf19, test_data19$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 1012 219 93 0
R 271 2156 167 1
S 116 112 762 0
T 0 0 0 164
Overall Statistics
Accuracy : 0.807
95% CI : (0.7959, 0.8178)
No Information Rate : 0.4902
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.697
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.7234 0.8669 0.7456 0.99394
Specificity 0.9151 0.8302 0.9437 1.00000
Pos Pred Value 0.7644 0.8308 0.7697 1.00000
Neg Pred Value 0.8968 0.8664 0.9363 0.99980
Prevalence 0.2758 0.4902 0.2015 0.03253
Detection Rate 0.1995 0.4250 0.1502 0.03233
Detection Prevalence 0.2610 0.5115 0.1952 0.03233
Balanced Accuracy 0.8192 0.8486 0.8447 0.99697
# 20: Months method, rough validation, GPS diff points
# Output below: accuracy 0.874 equals the no-information rate,
# kappa 0.11.
confusionMatrix(predictions_rf20, test_data20$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 0 1 0 0
R 52 777 38 13
S 0 0 0 0
T 3 5 1 6
Overall Statistics
Accuracy : 0.8739
95% CI : (0.8504, 0.8949)
No Information Rate : 0.8739
P-Value [Acc > NIR] : 0.525
Kappa : 0.1074
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.000000 0.9923 0.00000 0.315789
Specificity 0.998811 0.0885 1.00000 0.989738
Pos Pred Value 0.000000 0.8830 NaN 0.400000
Neg Pred Value 0.938547 0.6250 0.95647 0.985244
Prevalence 0.061384 0.8739 0.04353 0.021205
Detection Rate 0.000000 0.8672 0.00000 0.006696
Detection Prevalence 0.001116 0.9821 0.00000 0.016741
Balanced Accuracy 0.499405 0.5404 0.50000 0.652764
# 21: Months method, rough validation + refinement 10-90th, GPS points
# confusionMatrix(predictions_rf21, test_data21$EUNISa_1)
# 22: Months method, rough validation + refinement 10-90th, GPS diff points
# confusionMatrix(predictions_rf22, test_data22$EUNISa_1)
# 23: Months method, rough validation + refinement 20-80th, GPS points
# confusionMatrix(predictions_rf23, test_data23$EUNISa_1)
# 24: Months method, rough validation + refinement 20-80th, GPS diff points
# confusionMatrix(predictions_rf24, test_data24$EUNISa_1)
# Heat-map of the confusion matrix for model 19 (months method, rough
# validation, GPS points), with cell counts printed, then saved to disk.
cm_rf19 <- as.data.frame(as.table(confusionMatrix(predictions_rf19,
                                                  test_data19$EUNISa_1)))
colnames(cm_rf19) <- c("Prediction", "Reference", "Freq")
plot_cm_rf19 <- ggplot(cm_rf19,
                       aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), color = "black", size = 5) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix", x = "Reference", y = "Prediction") +
  theme_minimal() +
  # No legend; enlarge axis and title text for the small output figure
  theme(legend.position = "none",
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        plot.title = element_text(size = 16, face = "bold"))
plot_cm_rf19
ggsave(here("output", "figures", "RF", "plot_cm_rf19.jpeg"), plot_cm_rf19,
       dpi = 300, width = 3, height = 3)
# Variable-importance plots for each random forest (varimp_rfN objects
# computed earlier, outside this excerpt). The wide bottom margin
# presumably leaves room for long variable names — confirm against the
# plot method used for these objects.
plot(varimp_rf1$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf2$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf3$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf4$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf5$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf6$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf7$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf8$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf9$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf10$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf11$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf12$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf13$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf14$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf15$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf16$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf17$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf18$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf19$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf20$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf21$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf22$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf23$varimp, margin = c(10, 6, 2, 2))
# plot(varimp_rf24$varimp, margin = c(10, 6, 2, 2))
# Conditional variable-importance plots.
# FIX: the original calls used `varimp-cond_rfN`, which R parses as the
# subtraction `varimp - cond_rfN` and fails with
# "objeto 'cond_rfN' no encontrado" (as the transcript showed). The
# intended object is presumably `varimp_cond_rfN` — confirm that these
# objects are computed earlier before re-running this chunk.
plot(varimp_cond_rf1$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf3$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf4$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf5$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf6$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf7$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf8$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf9$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf10$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf11$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf12$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf13$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf14$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf15$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf16$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf17$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf18$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf19$varimp, margin = c(9, 3, 2, 2))
plot(varimp_cond_rf20$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf21$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf22$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf23$varimp, margin = c(9, 3, 2, 2))
# plot(varimp_cond_rf24$varimp, margin = c(9, 3, 2, 2))
# Runtime of each ROC computation.  Every call below errored in the
# rendered transcript because the roc_dataN objects were never created
# in this session -- NOTE(review): confirm the chunk that builds
# roc_dataN actually ran before this point.
print(roc_data1$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data1' no encontrado
g
print(roc_data2$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data2' no encontrado
g
print(roc_data3$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data3' no encontrado
g
print(roc_data4$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data4' no encontrado
g
# print(roc_data5$time)
# print(roc_data6$time)
# print(roc_data7$time)
# print(roc_data8$time)
print(roc_data9$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data9' no encontrado
g
print(roc_data10$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data10' no encontrado
g
print(roc_data11$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data11' no encontrado
g
print(roc_data12$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data12' no encontrado
g
# print(roc_data13$time)
# print(roc_data14$time)
# print(roc_data15$time)
# print(roc_data16$time)
print(roc_data17$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data17' no encontrado
g
print(roc_data18$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data18' no encontrado
g
print(roc_data19$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data19' no encontrado
g
print(roc_data20$time)
G1;Error en h(simpleError(msg, call)):
error al evaluar el argumento 'x' al seleccionar un método para la función 'print': objeto 'roc_data20' no encontrado
g
# print(roc_data21$time)
# print(roc_data22$time)
# print(roc_data23$time)
# print(roc_data24$time)
# Persist the rf1 ROC data; errored here for the same reason as above
# (`roc_data1` never created in this session).
save(roc_data1,
file = "objects/RF/rf1_rocdata_l1_slope_novalid_GPS_S2.Rdata")
G1;Error en save(roc_data1, file = "objects/RF/rf1_rocdata_l1_slope_novalid_GPS_S2.Rdata"):
object ‘roc_data1’ not found
g
# One multiclass ROC plot per random-forest run.  The original repeated
# the identical ggplot recipe for every model; a single helper removes
# the copy-paste surface and keeps all plots in sync.  `size` is
# replaced by `linewidth` for geom_line (deprecated since ggplot2 3.4;
# this session runs 3.5.2).
plot_multiclass_roc <- function(roc_df) {
  # roc_df: data frame with FPR, TPR and Class (class label incl. AUC),
  # i.e. the $roc element of a roc_dataN object.
  ggplot(roc_df, aes(x = FPR, y = TPR, color = Class)) +
    geom_line(linewidth = 1.2) +
    geom_abline(linetype = "dashed", color = "gray") +
    labs(title = "Multiclass ROC Curves with AUC",
         x = "False Positive Rate",
         y = "True Positive Rate",
         color = "Class (AUC)") +
    theme_minimal() +
    theme(legend.position = "bottom")
}
roc1 <- plot_multiclass_roc(roc_data1$roc)
roc2 <- plot_multiclass_roc(roc_data2$roc)
roc3 <- plot_multiclass_roc(roc_data3$roc)
roc4 <- plot_multiclass_roc(roc_data4$roc)
# roc5 <- plot_multiclass_roc(roc_data5$roc)
# roc6 <- plot_multiclass_roc(roc_data6$roc)
# roc7 <- plot_multiclass_roc(roc_data7$roc)
# roc8 <- plot_multiclass_roc(roc_data8$roc)
roc9 <- plot_multiclass_roc(roc_data9$roc)
roc10 <- plot_multiclass_roc(roc_data10$roc)
roc11 <- plot_multiclass_roc(roc_data11$roc)
roc12 <- plot_multiclass_roc(roc_data12$roc)
# roc13 <- plot_multiclass_roc(roc_data13$roc)
# roc14 <- plot_multiclass_roc(roc_data14$roc)
# roc15 <- plot_multiclass_roc(roc_data15$roc)
# roc16 <- plot_multiclass_roc(roc_data16$roc)
roc17 <- plot_multiclass_roc(roc_data17$roc)
roc18 <- plot_multiclass_roc(roc_data18$roc)
roc19 <- plot_multiclass_roc(roc_data19$roc)
roc20 <- plot_multiclass_roc(roc_data20$roc)
# roc21 <- plot_multiclass_roc(roc_data21$roc)
# roc22 <- plot_multiclass_roc(roc_data22$roc)
# roc23 <- plot_multiclass_roc(roc_data23$roc)
# roc24 <- plot_multiclass_roc(roc_data24$roc)
# Display the ROC plots (top-level auto-print).
roc1
roc2
roc3
roc4
# roc5
# roc6
# roc7
# roc8
roc9
roc10
roc11
roc12
# roc13
# roc14
# roc15
# roc16
roc17
roc18
roc19
roc20
# roc21
# roc22
# roc23
# roc24
Refinement is based on the 4 most important variables (excluding canopy_height). If the same type of variable appears more than once among the 4 most important ones for different indices, take the next most important variable of a different type. For rf19 (Months method, rough validation, GPS points), the variables selected for refinement are: EVI_pos_value, SAVI_auc_mar_oct, SAVI_avg_value_03, NDVI_min.
distr_plot_percentiles <- function(data, y_vars, y_labels,
                                   lower = 0.1, upper = 0.9) {
  # Raincloud-style distribution plots per EUNIS level-1 habitat.
  # For each variable in `y_vars`, observations outside the
  # [lower, upper] percentile band (computed per EUNISa_1 group) are
  # greyed out; one plot is printed per variable.
  #
  # data        : data frame with EUNISa_1, EUNISa_1_descr and y_vars.
  # y_vars      : character vector of column names to plot.
  # y_labels    : y-axis labels, same length/order as y_vars.
  # lower/upper : percentile bounds used to flag outliers; defaults
  #               reproduce the original hard-coded 10th/90th.
  #
  # Changes vs. original: percentile bounds parameterized (backward
  # compatible), deprecated `..label..` replaced by after_stat(), and
  # deprecated `guides(fill = FALSE)` replaced by "none".
  stopifnot(length(y_vars) == length(y_labels))
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    # Percentile band per EUNISa_1 group
    percentiles <- data %>%
      group_by(EUNISa_1) %>%
      summarise(
        p10 = quantile(.data[[y_var]], lower, na.rm = TRUE),
        p90 = quantile(.data[[y_var]], upper, na.rm = TRUE),
        .groups = "drop"
      )
    # Flag observations outside the band
    data_flagged <- data %>%
      left_join(percentiles, by = "EUNISa_1") %>%
      mutate(outlier_flag = case_when(
        .data[[y_var]] < p10 ~ "low",
        .data[[y_var]] > p90 ~ "high",
        TRUE ~ "mid"
      ))
    # Plot only the four target habitat groups
    p <- ggplot(data = data_flagged %>%
                  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = .data[[y_var]])) +
      geom_flat_violin(aes(fill = EUNISa_1_descr),
                       position = position_nudge(x = 0.2, y = 0),
                       alpha = 0.8) +
      # Outliers (outside the band) drawn in grey
      geom_point(aes(color = ifelse(outlier_flag == "mid",
                                    EUNISa_1_descr, "grey")),
                 position = position_jitter(width = 0.15), size = 1,
                 alpha = 0.6) +
      geom_boxplot(aes(fill = EUNISa_1_descr), width = 0.2,
                   outlier.shape = NA, alpha = 0.5) +
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Group size printed just above each group's maximum
      stat_summary(fun.data = function(x) data.frame(y = max(x, na.rm = TRUE) +
                                                       0.1, label = length(x)),
                   geom = "text", aes(label = after_stat(label)),
                   vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip() +
      scale_color_manual(values = c(
        "Forests and other wooded land" = "#F8766D",
        "Grasslands" = "#7CAE00",
        "Heathlands, scrub and tundra" = "#00BFC4",
        "Wetlands" = "#C77CFF",
        "grey" = "grey"))
    print(p)
  }
}
# Distribution plots for the four refinement variables identified from
# the rf19 variable importances.
distr_plot_percentiles(
  # GPS points after rough validation
  filtered_data19_months %>%
    # Join to get EUNIS 1 descriptions; key made explicit to silence
    # the "Joining with `by = ...`" message and guard against joining
    # on other accidentally-shared columns.
    left_join(data_validation %>%
                select(PlotObservationID, EUNISa_1_descr),
              by = "PlotObservationID"),
  c("EVI_pos_value", "SAVI_auc_mar_oct", "SAVI_avg_value_03", "NDVI_min"),
  # Label typo fixed: "AVerage" -> "Average"
  c("EVI max value", "SAVI AUC", "Average SAVI value for March",
    "NDVI min value"))
Calculate percentiles:
# Per-habitat percentile thresholds (10/20/80/90th) for the four
# refinement variables.  Rewritten with across() to replace the 16
# near-identical quantile() calls; the .names glue reproduces the
# original column names exactly (e.g. perc_10_SAVI_diff_pos_march_value)
# so downstream filters keep working.  `T` -> `TRUE` (T is an ordinary,
# reassignable binding).
percentiles_months <-
  # GPS points after rough validation
  filtered_data19_months %>%
  group_by(EUNISa_1) %>%
  summarize(across(
    c(SAVI_diff_pos_march_value, EVI_auc_mar_oct,
      SAVI_diff_pos_oct_doy, EVI_pos_value),
    list(`10` = ~ quantile(.x, probs = 0.10, na.rm = TRUE),
         `20` = ~ quantile(.x, probs = 0.20, na.rm = TRUE),
         `80` = ~ quantile(.x, probs = 0.80, na.rm = TRUE),
         `90` = ~ quantile(.x, probs = 0.90, na.rm = TRUE)),
    .names = "perc_{.fn}_{.col}"
  ))
# 21: Months method, rough validation + refinement 10-90th, GPS points.
# Keep only observations inside the per-habitat 10th-90th percentile
# band of every refinement variable (closed intervals, as before).
filtered_data21_months <-
  # GPS points after rough validation
  filtered_data19_months %>%
  left_join(percentiles_months, by = "EUNISa_1") %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  dplyr::filter(
    between(SAVI_diff_pos_march_value,
            perc_10_SAVI_diff_pos_march_value,
            perc_90_SAVI_diff_pos_march_value),
    between(EVI_auc_mar_oct,
            perc_10_EVI_auc_mar_oct, perc_90_EVI_auc_mar_oct),
    between(SAVI_diff_pos_oct_doy,
            perc_10_SAVI_diff_pos_oct_doy, perc_90_SAVI_diff_pos_oct_doy),
    between(EVI_pos_value,
            perc_10_EVI_pos_value, perc_90_EVI_pos_value)
  )
# 23: Months method, rough validation + refinement 20-80th, GPS points.
# Same as filtered_data21_months but with the tighter 20th-80th band
# (closed intervals, as before).
filtered_data23_months <-
  # GPS points after rough validation
  filtered_data19_months %>%
  left_join(percentiles_months, by = "EUNISa_1") %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  dplyr::filter(
    between(SAVI_diff_pos_march_value,
            perc_20_SAVI_diff_pos_march_value,
            perc_80_SAVI_diff_pos_march_value),
    between(EVI_auc_mar_oct,
            perc_20_EVI_auc_mar_oct, perc_80_EVI_auc_mar_oct),
    between(SAVI_diff_pos_oct_doy,
            perc_20_SAVI_diff_pos_oct_doy, perc_80_SAVI_diff_pos_oct_doy),
    between(EVI_pos_value,
            perc_20_EVI_pos_value, perc_80_EVI_pos_value)
  )
# Sample sizes per habitat (Q/R/S/T) after each refinement level.
# The per-dataset summary is factored into a helper, and the stray
# trailing comma in the original bind_rows() call is removed (rlang's
# dots tolerate it, but base-R-style calls would error).
count_by_habitat <- function(df, label) {
  # One-row tibble: dataset label + count of each target habitat.
  df %>%
    count(EUNISa_1) %>%
    pivot_wider(names_from = EUNISa_1, values_from = n) %>%
    mutate(data = label) %>%
    select(data, Q, R, S, T)
}
bind_rows(
  count_by_habitat(filtered_data21_months, "filtered_data21_months"),
  count_by_habitat(filtered_data23_months, "filtered_data23_months")
)
# 70/30 train-test split for the two refined data sets.
# FIX: the original drew train_indices21/23 TWICE in a row (a paste
# duplicate), silently discarding the first draw and shifting the RNG
# stream; the duplicate sample() calls are removed, so the drawn rows
# differ from the rendered transcript.  seq_len() replaces 1:nrow()
# (1:n yields c(1, 0) when n == 0) and the sample size is floored
# explicitly instead of relying on sample()'s silent truncation.
set.seed(123)
train_indices21 <- sample(seq_len(nrow(filtered_data21_months)),
                          floor(0.7 * nrow(filtered_data21_months)))
train_indices23 <- sample(seq_len(nrow(filtered_data23_months)),
                          floor(0.7 * nrow(filtered_data23_months)))
train_data21 <- filtered_data21_months[train_indices21, ]
train_data23 <- filtered_data23_months[train_indices23, ]
# Held-out test sets: used by confusionMatrix() below but never defined
# in the original chunk -- NOTE(review): confirm they are not built in
# another (hidden) chunk before keeping these definitions.
test_data21 <- filtered_data21_months[-train_indices21, ]
test_data23 <- filtered_data23_months[-train_indices23, ]
# Fit time of each refined random forest.
print(rf21$time)
print(rf23$time)
# 21: Months method, rough validation + refinement 10-90th, GPS points
confusionMatrix(predictions_rf21, test_data21$EUNISa_1)
# 23: Months method, rough validation + refinement 20-80th, GPS points
confusionMatrix(predictions_rf23, test_data23$EUNISa_1)
# Long-format confusion-matrix counts (Prediction / Reference / Freq)
# for the heatmaps below.
cm_rf21 <- confusionMatrix(predictions_rf21, test_data21$EUNISa_1) %>%
  as.table() %>%
  as.data.frame() %>%
  setNames(c("Prediction", "Reference", "Freq"))
cm_rf23 <- confusionMatrix(predictions_rf23, test_data23$EUNISa_1) %>%
  as.table() %>%
  as.data.frame() %>%
  setNames(c("Prediction", "Reference", "Freq"))
# Confusion-matrix heatmaps.  The identical ggplot recipe was repeated
# for both models; factored into a single helper.
plot_confusion_heatmap <- function(cm_df) {
  # cm_df: long-format counts with Prediction / Reference / Freq.
  ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = "Confusion Matrix", x = "Reference", y = "Prediction") +
    theme_minimal() +
    theme(legend.position = "none") +
    theme(axis.text = element_text(size = 12),
          axis.title = element_text(size = 14),
          plot.title = element_text(size = 16, face = "bold"))
}
plot_cm_rf21 <- plot_confusion_heatmap(cm_rf21)
plot_cm_rf23 <- plot_confusion_heatmap(cm_rf23)
# Display both heatmaps
plot_cm_rf21
plot_cm_rf23
# Save the heatmaps (3x3 in at 300 dpi) and show the variable
# importances of the refined models.
ggsave(here("output", "figures", "RF", "plot_cm_rf21.jpeg"), plot_cm_rf21,
dpi = 300, width = 3, height = 3)
ggsave(here("output", "figures", "RF", "plot_cm_rf23.jpeg"), plot_cm_rf23,
dpi = 300, width = 3, height = 3)
plot(varimp_rf21$varimp, margin = c(10, 6, 2, 2))
plot(varimp_rf23$varimp, margin = c(10, 6, 2, 2))
Refinement is based on the variables: SAVI_pos_value, NDWI_min, NDMI_min, NDMI_max (check the variable importances carefully later!).
Distribution plots:
distr_plot_percentiles <- function(data, y_vars, y_labels,
                                   lower = 0.1, upper = 0.9) {
  # Raincloud-style distribution plots per EUNIS level-1 habitat
  # (redefinition without the fixed colour scale of the earlier
  # version).  Observations outside the per-group [lower, upper]
  # percentile band are greyed out; one plot is printed per variable.
  #
  # data        : data frame with EUNISa_1, EUNISa_1_descr and y_vars.
  # y_vars      : character vector of column names to plot.
  # y_labels    : y-axis labels, same length/order as y_vars.
  # lower/upper : percentile bounds; defaults reproduce the original
  #               hard-coded 10th/90th.
  #
  # Changes vs. original: bounds parameterized (backward compatible),
  # deprecated `..label..` -> after_stat(), `guides(FALSE)` -> "none",
  # and filter() qualified as dplyr::filter() for consistency with the
  # rest of the file (stats::filter is masked anyway).
  stopifnot(length(y_vars) == length(y_labels))
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    # Percentile band per EUNISa_1 group
    percentiles <- data %>%
      group_by(EUNISa_1) %>%
      summarise(
        p10 = quantile(.data[[y_var]], lower, na.rm = TRUE),
        p90 = quantile(.data[[y_var]], upper, na.rm = TRUE),
        .groups = "drop"
      )
    # Flag observations outside the band
    data_flagged <- data %>%
      left_join(percentiles, by = "EUNISa_1") %>%
      mutate(outlier_flag = case_when(
        .data[[y_var]] < p10 ~ "low",
        .data[[y_var]] > p90 ~ "high",
        TRUE ~ "mid"
      ))
    # Plot only the four target habitat groups
    p <- ggplot(data = data_flagged %>%
                  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = .data[[y_var]])) +
      geom_flat_violin(aes(fill = EUNISa_1_descr),
                       position = position_nudge(x = 0.2, y = 0),
                       alpha = 0.8) +
      geom_point(aes(color = ifelse(outlier_flag == "mid",
                                    EUNISa_1_descr, "grey")),
                 position = position_jitter(width = 0.15), size = 1,
                 alpha = 0.6) +
      geom_boxplot(aes(fill = EUNISa_1_descr), width = 0.2,
                   outlier.shape = NA, alpha = 0.5) +
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Group size printed just above each group's maximum
      stat_summary(fun.data = function(x) data.frame(y = max(x, na.rm = TRUE) +
                                                       0.1, label = length(x)),
                   geom = "text", aes(label = after_stat(label)),
                   vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip()
    print(p)
  }
}
# Distribution plots of the candidate refinement variables for the GPS
# points that passed the rough validation.  Labels are deliberately the
# raw variable names here.
distr_plot_percentiles(
  data_validation %>%
    # Keep GPS points after rough validation
    dplyr::filter(PlotObservationID %in% filtered_data3$PlotObservationID),
  c("canopy_height", "SAVI_pos_value", "NDWI_min", "NDMI_min",
    "NDMI_max"),
  c("canopy_height", "SAVI_pos_value", "NDWI_min", "NDMI_min",
    "NDMI_max"))
So far not using canopy_height for refinement.
Calculate percentiles:
# Per-habitat percentile thresholds (10/20/80/90th) for the S2
# refinement variables.  Rewritten with across() to replace the 16
# near-identical quantile() calls; the .names glue reproduces the
# original column names exactly (e.g. percentile_10_SAVI_pos_value).
# `T` -> `TRUE` (T is an ordinary, reassignable binding).
percentiles6 <- data_validation %>%
  # GPS points after rough validation
  dplyr::filter(PlotObservationID %in% filtered_data3$PlotObservationID) %>%
  group_by(EUNISa_1) %>%
  summarize(across(
    c(SAVI_pos_value, NDWI_min, NDMI_min, NDMI_max),
    list(`10` = ~ quantile(.x, probs = 0.10, na.rm = TRUE),
         `20` = ~ quantile(.x, probs = 0.20, na.rm = TRUE),
         `80` = ~ quantile(.x, probs = 0.80, na.rm = TRUE),
         `90` = ~ quantile(.x, probs = 0.90, na.rm = TRUE)),
    .names = "percentile_{.fn}_{.col}"
  ))
# 10-90th percentile refinement of the rough-validation GPS points,
# using the per-habitat thresholds in percentiles6 (closed intervals,
# as before).
filtered_data6 <- data_validation %>%
  # Keep GPS points after rough validation
  dplyr::filter(PlotObservationID %in% filtered_data3$PlotObservationID) %>%
  select(PlotObservationID, EUNISa_1, all_of(vars_RF)) %>%
  left_join(percentiles6, by = "EUNISa_1") %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  dplyr::filter(
    between(SAVI_pos_value,
            percentile_10_SAVI_pos_value, percentile_90_SAVI_pos_value),
    between(NDWI_min, percentile_10_NDWI_min, percentile_90_NDWI_min),
    between(NDMI_min, percentile_10_NDMI_min, percentile_90_NDMI_min),
    between(NDMI_max, percentile_10_NDMI_max, percentile_90_NDMI_max)
  )
Split into training and test data sets.
# 70/30 train-test split for the 10-90th refinement data.
# seq_len() replaces 1:nrow() (1:n yields c(1, 0) when n == 0) and the
# sample size is floored explicitly instead of relying on sample()'s
# silent truncation.  Draws are unchanged for the same seed.
set.seed(123)
train_indices6 <- sample(seq_len(nrow(filtered_data6)),
                         floor(0.7 * nrow(filtered_data6)))
train_data6 <- filtered_data6[train_indices6, ]
test_data6 <- filtered_data6[-train_indices6, ]
Number of points per category for filtered data:
# Points per EUNIS level-1 class after the 10-90th refinement
filtered_data6 %>% count(EUNISa_1)
Confusion matrix:
# Hold-out performance of rf6 (S2 variables)
confusionMatrix(predictions_rf6_S2, test_data6$EUNISa_1)
Variable Importance Plot
# Standard and conditional permutation importances for rf6 (S2)
plot(varimp_rf6_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf6_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# Per-class ROC curves for rf6 (S2).
roc6 <- ggplot(roc_data6_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(size = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(title = "Multiclass ROC Curves with AUC",
       x = "False Positive Rate",
       y = "True Positive Rate",
       color = "Class (AUC)") +
  theme_minimal() +
  theme(legend.position = "bottom")
roc6
# 20-80th percentile refinement of the rough-validation GPS points,
# using the per-habitat thresholds in percentiles6 (closed intervals,
# as before).
filtered_data7 <- data_validation %>%
  # Keep GPS points after rough validation
  dplyr::filter(PlotObservationID %in% filtered_data3$PlotObservationID) %>%
  select(PlotObservationID, EUNISa_1, all_of(vars_RF)) %>%
  left_join(percentiles6, by = "EUNISa_1") %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  dplyr::filter(
    between(SAVI_pos_value,
            percentile_20_SAVI_pos_value, percentile_80_SAVI_pos_value),
    between(NDWI_min, percentile_20_NDWI_min, percentile_80_NDWI_min),
    between(NDMI_min, percentile_20_NDMI_min, percentile_80_NDMI_min),
    between(NDMI_max, percentile_20_NDMI_max, percentile_80_NDMI_max)
  )
Split into training and test data sets.
# 70/30 train-test split for the 20-80th refinement data.
# seq_len() replaces 1:nrow() and the sample size is floored
# explicitly; draws are unchanged for the same seed.
set.seed(123)
train_indices7 <- sample(seq_len(nrow(filtered_data7)),
                         floor(0.7 * nrow(filtered_data7)))
train_data7 <- filtered_data7[train_indices7, ]
test_data7 <- filtered_data7[-train_indices7, ]
Number of points per category for filtered data:
# Points per EUNIS level-1 class after the 20-80th refinement
filtered_data7 %>% count(EUNISa_1)
Confusion matrix:
# Hold-out performance of rf7 (S2 variables)
confusionMatrix(predictions_rf7_S2, test_data7$EUNISa_1)
Variable Importance Plot
# Standard and conditional permutation importances for rf7 (S2)
plot(varimp_rf7_S2$varimp, margin = c(9, 3, 2, 2))
plot(varimp_rf7_cond_S2$varimp, margin = c(9, 3, 2, 2))
ROC curves:
# Per-class ROC curves for rf7 (S2, 20-80th refinement).
roc7 <- ggplot(roc_data7_S2$roc, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(size = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
# FIX: the original printed roc6 here (copy-paste slip); this chunk
# builds roc7, so display roc7.
roc7
# Which validation rule(s) each flagged observation breaks (NDWI,
# canopy height, or both), by number of broken rules.
data_validation %>%
  mutate(rules_broken = case_when(
    valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
    valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
    valid_1_count == 2 &
      valid_1_NDWI == "wrong" & valid_1_CH == "wrong" ~ "NDWI + CH",
    TRUE ~ NA_character_
  )) %>%
  ggplot(aes(x = valid_1_count, fill = rules_broken)) +
  geom_bar() +
  labs(x = "Number of broken rules")
Proportion of observations not validated (so far):
# Share of observations that break at least one rule (rows with NA
# counts are excluded from the numerator, exactly as the original
# filter() did).
sum(data_validation$valid_1_count > 0, na.rm = TRUE) /
  nrow(data_validation)
But be aware that many observations still have missing RS data.
# Validation outcome split by whether the plot was located with a
# differential GPS (missing location method counted as "no").
data_validation %>%
  mutate(diff_GPS = if_else(
    Lctnmth != "Location with differential GPS" | is.na(Lctnmth),
    "no", "yes")) %>%
  ggplot(aes(x = diff_GPS, fill = valid_1)) +
  geom_bar() +
  labs(x = "Differential GPS")
# Validation outcome split by GPS use (differential or plain GPS =
# "yes"; anything else, including NA, = "no" -- %in% maps NA to FALSE,
# matching the original is.na / TRUE fallthrough).
data_validation %>%
  mutate(GPS = case_when(
    Lctnmth %in% c("Location with differential GPS",
                   "Location with GPS") ~ "yes",
    TRUE ~ "no"
  )) %>%
  ggplot(aes(x = GPS, fill = valid_1)) +
  geom_bar() +
  labs(x = "GPS")
# Load world boundaries
world <- ne_countries(scale = "medium", returnclass = "sf")
# GPS-located points of the four target habitats that have RS data.
# The filter chain was duplicated between the extent computation and
# the plot; it is now computed once.  `T` -> `TRUE` (T is reassignable).
# NOTE(review): this chunk filters on `Location method` while earlier
# chunks use `Lctnmth` -- confirm both columns exist / agree.
points_GPS <- data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  dplyr::filter(`Location method` == "Location with differential GPS" |
                  `Location method` == "Location with GPS")
# Calculate the extent of the points
points_GPS_extent <- points_GPS %>%
  summarise(lon_min = min(Lon_updated, na.rm = TRUE),
            lon_max = max(Lon_updated, na.rm = TRUE),
            lat_min = min(Lat_updated, na.rm = TRUE),
            lat_max = max(Lat_updated, na.rm = TRUE))
# Add padding to the extent (adjust as needed)
padding <- 2 # Adjust padding to your preference
x_limits <- c(points_GPS_extent$lon_min - padding,
              points_GPS_extent$lon_max + padding)
y_limits <- c(points_GPS_extent$lat_min - padding,
              points_GPS_extent$lat_max + padding)
# Create the zoomed map
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = points_GPS,
             aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
Number of GPS points by Country:
# GPS points (target habitats with RS data) per country.
# `T` -> `TRUE` (T is an ordinary, reassignable binding).
data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  dplyr::filter(`Location method` == "Location with differential GPS" |
                  `Location method` == "Location with GPS") %>%
  count(Country)
# ReSurvey points (any location method) of the four target habitats
# that have RS data.  As with the GPS map, the duplicated filter chain
# is computed once and reused; `T` -> `TRUE`.  Uses `world`, loaded for
# the GPS map above.
points_resurvey <- data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE)
# Calculate the extent of the points
points_resurvey_extent <- points_resurvey %>%
  summarise(lon_min = min(Lon_updated, na.rm = TRUE),
            lon_max = max(Lon_updated, na.rm = TRUE),
            lat_min = min(Lat_updated, na.rm = TRUE),
            lat_max = max(Lat_updated, na.rm = TRUE))
# Add padding to the extent (adjust as needed)
padding <- 2 # Adjust padding to your preference
x_limits <- c(points_resurvey_extent$lon_min - padding,
              points_resurvey_extent$lon_max + padding)
y_limits <- c(points_resurvey_extent$lat_min - padding,
              points_resurvey_extent$lat_max + padding)
# Create the zoomed map
ggplot() +
  geom_sf(data = world, fill = "lightblue", color = "gray") +
  geom_point(data = points_resurvey,
             aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
             size = 1) +
  coord_sf(xlim = x_limits, ylim = y_limits) +
  theme_minimal()
Number of ReSurvey points by Country:
# ReSurvey points (target habitats with RS data) per country.
# `T` -> `TRUE` (T is an ordinary, reassignable binding).
data_validation %>%
  dplyr::filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  dplyr::filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  count(Country)
# Cordillera validation layers: Sentinel indices, phenology metrics and
# 1 m canopy height for the AlpineGrasslands and VegetationTypes sets.
# NOTE(review): hard-coded absolute Windows paths -- consider here() or
# a configurable data directory so the script runs on other machines.
AlpineGrasslands_indices <- read_csv(
"C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrassland_Sentinel_Plot_Allyear_Allmetrics.csv")
AlpineGrasslands_phen <- read_csv(
"C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
AlpineGrasslands_CH <- read_csv(
"C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_CanopyHeight_1m.csv")
VegetationTypes_indices <- read_csv(
"C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Sentinel_Plot_AllYear_Allmetrics.csv")
VegetationTypes_phen <- read_csv(
"C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
VegetationTypes_CH <- read_csv(
"C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_CanopyHeight_1m.csv")
# Merge the three AlpineGrasslands tables (indices + phenology + canopy
# height) by their shared columns.
# CAUTION: the "H�bitat" strings below are the mis-encoded column names
# as they arrive from the CSVs -- they must stay byte-for-byte as-is
# for rename() to match; do not "fix" the encoding here.
AlpineGrasslands <- AlpineGrasslands_indices %>%
select(-`system:index`, -.geo, -Localidad) %>%
rename(Hábitat = "H�bitat") %>%
full_join(AlpineGrasslands_phen %>%
select(-`system:index`, -.geo, -Localidad) %>%
rename(Hábitat = "H�bitat")) %>%
full_join(AlpineGrasslands_CH %>%
select(-`system:index`, -.geo, -Localidad)) %>%
select(-Date__year, - `Precisi�n`) %>%
mutate(DATE = ymd(DATE)) %>%
rename(ID = "Releve_num") %>%
mutate(ID = as.character(ID)) %>%
mutate(layer = "AlpineGrasslands")
# Same for the VegetationTypes tables; joins rely on the default
# "join by all shared columns" behaviour -- TODO consider explicit
# `by =` keys.
VegetationTypes <- VegetationTypes_indices %>%
select(-`system:index`, -.geo) %>%
full_join(VegetationTypes_phen %>%
select(-`system:index`, -.geo)) %>%
full_join(VegetationTypes_CH %>%
select(-`system:index`, -.geo)) %>%
rename(Hábitat = "TYPE") %>%
mutate(layer = "VegetationTypes")
Merge both datasets:
# Combine both Cordillera layers and derive EUNIS level-1 classes from
# the habitat descriptions.
# FIX: in the original case_when() each condition was written as
# `Hábitat = str_detect(...) ~ "R"`; the `Hábitat =` turns the formula
# into a named argument whose name case_when() silently ignores.  The
# spurious names are removed (evaluation is unchanged).
cordillera <- bind_rows(
  AlpineGrasslands %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer"),
  VegetationTypes %>% select(DATE, ID, starts_with("NDMI"),
                             starts_with("NDVI"), Hábitat, "EOS_DOY",
                             "Peak_DOY", "SOS_DOY", "Season_Length",
                             "canopy_height", "layer")
) %>%
  mutate(EUNISa_1 = case_when(
    str_detect(Hábitat, "Pastizal|Cervunal|grassland|meadow") ~ "R",
    str_detect(Hábitat, "forest") ~ "T",
    str_detect(Hábitat, "Scrub|scrub|Shrubland|shrubland|shrub|Heathland") ~ "S",
    str_detect(Hábitat, "Suelo|Scree|scree|cliff") ~ "U",
    # NOTE(review): missing habitat mapped to grasslands -- confirm
    is.na(Hábitat) ~ "R",
    TRUE ~ NA_character_),
    EUNISa_1_descr = case_when(
      EUNISa_1 == "R" ~ "Grasslands",
      EUNISa_1 == "T" ~ "Forests and other wooded land",
      EUNISa_1 == "S" ~ "Heathlands, scrub and tundra",
      EUNISa_1 == "U" ~ "Inland habitats with no or little soil")
  )
# Distribution plots of the NDVI / NDMI metrics for the Cordillera
# points.  NOTE(review): distr_plot() (not the percentile variant
# defined above) is defined in an earlier chunk -- confirm it is still
# in scope here.
distr_plot(cordillera,
c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10"),
c("NDVI max", "NDVI p90", "NDVI min", "NDVI p10"))
distr_plot(cordillera,
c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10"),
c("NDMI max", "NDMI p90", "NDMI min", "NDMI p10"))
# Record package versions for reproducibility
sessionInfo()